我有一个pandas数据帧,我也已将其写入文件。我还为json格式的数据创建了一个模式。我将其存储为一个python字典,并将其写入文件。
我尝试过使用to_gbq
和命令行上传,在这两种情况下,我都会遇到一个错误,即有一个重复的字段,相同的字段。
这是关于数据的信息:
代码
df.shape
len(clinvar_variant_schema)
schema_fields = [x['name'] for x in clinvar_variant_schema]
schema_fields.sort()
json.dumps(schema_fields)
colnames = df.columns.tolist()
colnames.sort()
json.dumps(colnames)
set(schema_fields).difference(set(colnames))
set(colnames).difference(set(schema_fields))
输出
(1000, 24)
24
'["AF_ESP", "AF_EXAC", "AF_TGP", "ALLELEID", "ALT", "CHROM", "CLNDISDB", "CLNDN", "CLNHGVS", "CLNREVSTAT", "CLNSIG", "CLNSIGCONF", "CLNVC", "CLNVCSO", "CLNVI", "FILTER", "GENEINFO", "ID", "MC", "ORIGIN", "POS", "QUAL", "REF", "RS"]'
'["AF_ESP", "AF_EXAC", "AF_TGP", "ALLELEID", "ALT", "CHROM", "CLNDISDB", "CLNDN", "CLNHGVS", "CLNREVSTAT", "CLNSIG", "CLNSIGCONF", "CLNVC", "CLNVCSO", "CLNVI", "FILTER", "GENEINFO", "ID", "MC", "ORIGIN", "POS", "QUAL", "REF", "RS"]'
set()
set()
Colnames Schema_Names
0 AF_ESP AF_ESP
1 AF_EXAC AF_EXAC
2 AF_TGP AF_TGP
3 ALLELEID ALLELEID
4 ALT ALT
5 CHROM CHROM
6 CLNDISDB CLNDISDB
7 CLNDN CLNDN
8 CLNHGVS CLNHGVS
9 CLNREVSTAT CLNREVSTAT
10 CLNSIG CLNSIG
11 CLNSIGCONF CLNSIGCONF
12 CLNVC CLNVC
13 CLNVCSO CLNVCSO
14 CLNVI CLNVI
15 FILTER FILTER
16 GENEINFO GENEINFO
17 ID ID
18 MC MC
19 ORIGIN ORIGIN
20 POS POS
21 QUAL QUAL
22 REF REF
23 RS RS
使用 pandas
project_id = "my_project_id"
table_id = "my_dataset.clinvar_vcf"
df.to_gbq(
destination_table = table_id,
project_id = project_id,
if_exists = "replace",
table_schema = clinvar_variant_schema
)
IPython will make a temporary file named: /tmp/ipython_edit_pw1v55fy/ipython_edit_k3v1q7m5.py
1it [00:00, 2.93it/s]
---------------------------------------------------------------------------
InvalidResponse Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in load_table_from_file(self, file_obj, destination, rewind, size, num_retries, job_id, job_id_prefix, location, project, job_config)
1829 response = self._do_resumable_upload(
-> 1830 file_obj, job_resource, num_retries
1831 )
/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in _do_resumable_upload(self, stream, metadata, num_retries)
2153 upload, transport = self._initiate_resumable_upload(
-> 2154 stream, metadata, num_retries
2155 )
/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in _initiate_resumable_upload(self, stream, metadata, num_retries)
2195 upload.initiate(
-> 2196 transport, stream, metadata, _GENERIC_CONTENT_TYPE, stream_final=False
2197 )
/opt/conda/lib/python3.7/site-packages/google/resumable_media/requests/upload.py in initiate(self, transport, stream, metadata, content_type, total_bytes, stream_final, timeout)
411 )
--> 412 self._process_initiate_response(response)
413 return response
/opt/conda/lib/python3.7/site-packages/google/resumable_media/_upload.py in _process_initiate_response(self, response)
505 self._get_status_code,
--> 506 callback=self._make_invalid,
507 )
/opt/conda/lib/python3.7/site-packages/google/resumable_media/_helpers.py in require_status_code(response, status_codes, get_status_code, callback)
110 u"Expected one of",
--> 111 *status_codes
112 )
InvalidResponse: ('Request failed with status code', 400, 'Expected one of', <HTTPStatus.OK: 200>, <HTTPStatus.CREATED: 201>)
During handling of the above exception, another exception occurred:
BadRequest Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in load_data(self, dataframe, dataset_id, table_id, chunksize, schema, progress_bar)
628 chunks = tqdm.tqdm(chunks)
--> 629 for remaining_rows in chunks:
630 logger.info(
/opt/conda/lib/python3.7/site-packages/tqdm/std.py in __iter__(self)
1173 try:
-> 1174 for obj in iterable:
1175 yield obj
/opt/conda/lib/python3.7/site-packages/pandas_gbq/load.py in load_chunks(client, dataframe, dataset_id, table_id, chunksize, schema, location)
81 job_config=job_config,
---> 82 location=location,
83 ).result()
/opt/conda/lib/python3.7/site-packages/google/cloud/bigquery/client.py in load_table_from_file(self, file_obj, destination, rewind, size, num_retries, job_id, job_id_prefix, location, project, job_config)
1836 except resumable_media.InvalidResponse as exc:
-> 1837 raise exceptions.from_http_response(exc.response)
1838
BadRequest: 400 POST https://bigquery.googleapis.com/upload/bigquery/v2/projects/eacri-genomics/jobs?uploadType=resumable: Cannot load CSV data with a repeated field. Field: FILTER
During handling of the above exception, another exception occurred:
GenericGBQException Traceback (most recent call last)
<ipython-input-55-19cb6dc0a4ee> in <module>
6 project_id = project_id,
7 if_exists = "replace",
----> 8 table_schema = clinvar_variant_schema
9 )
/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in to_gbq(self, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials)
1655 location=location,
1656 progress_bar=progress_bar,
-> 1657 credentials=credentials,
1658 )
1659
/opt/conda/lib/python3.7/site-packages/pandas/io/gbq.py in to_gbq(dataframe, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials, verbose, private_key)
226 credentials=credentials,
227 verbose=verbose,
--> 228 private_key=private_key,
229 )
/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in to_gbq(dataframe, destination_table, project_id, chunksize, reauth, if_exists, auth_local_webserver, table_schema, location, progress_bar, credentials, verbose, private_key)
1206 chunksize=chunksize,
1207 schema=table_schema,
-> 1208 progress_bar=progress_bar,
1209 )
1210
/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in load_data(self, dataframe, dataset_id, table_id, chunksize, schema, progress_bar)
634 )
635 except self.http_error as ex:
--> 636 self.process_http_error(ex)
637
638 def schema(self, dataset_id, table_id):
/opt/conda/lib/python3.7/site-packages/pandas_gbq/gbq.py in process_http_error(ex)
433 # <https://cloud.google.com/bigquery/troubleshooting-errors>`__
434
--> 435 raise GenericGBQException("Reason: {0}".format(ex))
436
437 def run_query(
GenericGBQException: Reason: 400 POST https://bigquery.googleapis.com/upload/bigquery/v2/projects/eacri-genomics/jobs?uploadType=resumable: Cannot load CSV data with a repeated field. Field: FILTER
使用命令行bq
bq load --source_format=CSV --field_delimiter=tab --replace --schema clinvar_variant_schema.json my_project_id:my_dataset.clinvar_vcf clinvar_expanded_vcf
BigQuery error in load operation: Cannot load CSV data with a repeated field.
Field: FILTER
---------------------------------------------------------------------------
CalledProcessError Traceback (most recent call last)
<ipython-input-45-be7b46a6694c> in <module>
----> 1 get_ipython().run_cell_magic('bash', '', 'bq load --source_format=CSV --field_delimiter=tab --replace --schema clinvar_variant_schema.json eacri-genomics:clinvar_9302020.clinvar_vcf clinvar_expanded_vcfn')
/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2379 with self.builtin_trap:
2380 args = (magic_arg_s, cell)
-> 2381 result = fn(*args, **kwargs)
2382 return result
2383
/opt/conda/lib/python3.7/site-packages/IPython/core/magics/script.py in named_script_magic(line, cell)
140 else:
141 line = script
--> 142 return self.shebang(line, cell)
143
144 # write a basic docstring:
<decorator-gen-103> in shebang(self, line, cell)
/opt/conda/lib/python3.7/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
185 # but it's overkill for just that one bit of state.
186 def magic_deco(arg):
--> 187 call = lambda f, *a, **k: f(*a, **k)
188
189 if callable(arg):
/opt/conda/lib/python3.7/site-packages/IPython/core/magics/script.py in shebang(self, line, cell)
243 sys.stderr.flush()
244 if args.raise_error and p.returncode!=0:
--> 245 raise CalledProcessError(p.returncode, cell, output=out, stderr=err)
246
247 def _run_script(self, p, cell, to_close):
CalledProcessError: Command 'b'bq load --source_format=CSV --field_delimiter=tab --replace --schema clinvar_variant_schema.json eacri-genomics:clinvar_9302020.clinvar_vcf clinvar_expanded_vcfn'' returned non-zero exit status 1.
确保 clinvar_variant_schema
中没有 'mode': 'REPEATED'
。还要检查表 clinvar_vcf
是否已经存在于 BigQuery 中,以及该表现有各列的模式(schema)。
看起来CSV不支持嵌套或重复数据。
https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv#limitations
我认为默认情况下to_gbq会转换为CSV,然后加载。因此,您可能希望使用CSV以外的其他格式。
建立在Sergey 的答案之上
你确实应该使用"重复的"(REPEATED)模式
bq_schema = [
SchemaField(name="sp_zipcode", field_type="STRING", mode="REPEATED", description="Search query parameter: zipcode")
]
这仍然导致Cannot load CSV data with a repeated field. Field: sp_zipcode
通过升级以下依赖包,这个问题为我解决了
pip install google-cloud-bigquery --upgrade
pip install pandas-gbq --upgrade
google-cloud-bigquery==2.32.0
pandas-gbq==0.17.0
以下是安装这两个软件包后的完整 pip freeze 输出:
cachetools==5.0.0
certifi==2021.10.8
charset-normalizer==2.0.11
db-dtypes==0.3.1
google-api-core==2.4.0
google-auth==2.6.0
google-auth-oauthlib==0.4.6
google-cloud-bigquery==2.32.0
google-cloud-bigquery-storage==2.11.0
google-cloud-core==2.2.2
google-crc32c==1.3.0
google-resumable-media==2.1.0
googleapis-common-protos==1.54.0
grpcio==1.43.0
grpcio-status==1.43.0
idna==3.3
libcst==0.4.1
mypy-extensions==0.4.3
numpy==1.21.5
oauthlib==3.2.0
packaging==21.3
pandas==1.3.5
pandas-gbq==0.17.0
proto-plus==1.19.9
protobuf==3.19.4
pyarrow==6.0.1
pyasn1==0.4.8
pyasn1-modules==0.2.8
pydata-google-auth==1.3.0
pyparsing==3.0.7
python-dateutil==2.8.2
pytz==2021.3
PyYAML==6.0
requests==2.27.1
requests-oauthlib==1.3.1
rsa==4.8
six==1.16.0
typing-extensions==4.0.1
typing-inspect==0.7.1
urllib3==1.26.8