"""Load a gzip-compressed, tab-separated VCF file into MongoDB in batches.

The path of the source file is read from standard input.  Lines starting
with ``##`` are skipped, the first other line is taken as the column
header, and every following line becomes one document in
``vcf_db.vcf_coll`` whose keys are the header fields.
"""
import gzip

import pymongo

# Number of documents sent to the server per insert_many() call.
BATCH_SIZE = 10000

src_file_path = input()

# Using the client as a context manager closes its connections once the
# load finishes or fails, instead of leaving them open until exit.
with pymongo.MongoClient(compressors='zstd') as client:
    vcf_db = client.vcf_db
    vcf_coll = vcf_db.vcf_coll

    with gzip.open(src_file_path, mode='rt') as src_file_opened:
        # Skip the '##' lines; the first line that is not one of them
        # is used as the header.
        for line in src_file_opened:
            if line.startswith('##'):
                continue
            header_row = line.rstrip('\n').split('\t')
            break

        # The same file iterator is reused, so this loop resumes on the
        # line right after the header.
        fragment, added_fragment_num = [], 0
        for line in src_file_opened:
            row = line.rstrip('\n').split('\t')
            # zip() pairs fields with header names positionally; if the
            # lengths differ, the surplus on the longer side is dropped.
            fragment.append(dict(zip(header_row, row)))
            if len(fragment) == BATCH_SIZE:
                vcf_coll.insert_many(fragment)
                added_fragment_num += 1
                fragment.clear()

        # Flush the final, partially filled batch.
        if fragment:
            vcf_coll.insert_many(fragment)
            added_fragment_num += 1
You received this message because you are subscribed to the Google Groups "mongodb-user" group.
For other MongoDB technical support options, see: https://docs.mongodb.com/manual/support/
---
You received this message because you are subscribed to the Google Groups "mongodb-user" group.
To unsubscribe from this group and stop receiving emails from it, send an email to mongodb-user...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/mongodb-user/24a97493-aa45-4b93-88fe-518aea037d48%40googlegroups.com.
To unsubscribe from this group and stop receiving emails from it, send an email to mongod...@googlegroups.com.
$ zcat ALL.chr6_GRCh38.genotypes.20170504.vcf.gz | sed -n '131p;132p;133p;134q' > test2.csv
$ tar -cvzf test2.tar.gz test2.csv
$ cat modified_code.py
# This code is expecting Python 3. You need to install compatible
# modules with pip3.
#
import pymongo, gzip
from pymongo import MongoClient
src_file_path = input()

client = MongoClient(compressors='zstd')
vcf_db = client.platonb_db
vcf_coll = vcf_db['vcf1']

with gzip.open(src_file_path, mode = 'rt') as src_file_opened:
    for line in src_file_opened:
        if line.startswith('##'):
            continue
        header_row = line.split('\n')[0].split('\t')
        break

    fragment, fragment_len, added_fragment_num = [], 0, 0
    for line in src_file_opened:
        row = line.split('\n')[0].split('\t')
        fragment.append(dict(zip(header_row, row)))
        fragment_len += 1
        if fragment_len == 10000:
            vcf_coll.insert_many(fragment)
            added_fragment_num += 1
            fragment.clear()
            fragment_len = 0
    if fragment_len > 0:
        vcf_coll.insert_many(fragment)
        added_fragment_num += 1
$ python3 modified_code.py
test2.tar.gz
Traceback (most recent call last):
File "modified_code.py", line 30, in <module>
vcf_coll.insert_many(fragment)
File "/home/vagrant/.local/lib/python3.6/site-packages/pymongo/collection.py", line 758, in insert_many
blk.execute(write_concern, session=session)
File "/home/vagrant/.local/lib/python3.6/site-packages/pymongo/bulk.py", line 511, in execute
return self.execute_command(generator, write_concern, session)
File "/home/vagrant/.local/lib/python3.6/site-packages/pymongo/bulk.py", line 346, in execute_command
self.is_retryable, retryable_bulk, s, self)
File "/home/vagrant/.local/lib/python3.6/site-packages/pymongo/mongo_client.py", line 1385, in _retry_with_session
return func(session, sock_info, retryable)
File "/home/vagrant/.local/lib/python3.6/site-packages/pymongo/bulk.py", line 341, in retryable_bulk
retryable, full_result)
File "/home/vagrant/.local/lib/python3.6/site-packages/pymongo/bulk.py", line 295, in _execute_command
result, to_send = bwc.execute(ops, client)
File "/home/vagrant/.local/lib/python3.6/site-packages/pymongo/message.py", line 895, in execute
request_id, msg, to_send = self._batch_command(docs)
File "/home/vagrant/.local/lib/python3.6/site-packages/pymongo/message.py", line 889, in _batch_command
self.codec, self)
File "/home/vagrant/.local/lib/python3.6/site-packages/pymongo/message.py", line 1376, in _do_bulk_write_command
namespace, operation, command, docs, check_keys, opts, ctx)
File "/home/vagrant/.local/lib/python3.6/site-packages/pymongo/message.py", line 1301, in _do_batched_op_msg
operation, command, docs, check_keys, ack, opts, ctx)
bson.errors.InvalidDocument: Key names must not contain the NULL byte
thgenomes_coll.find_one({'ID': 'rs544472365'})
Exactly after placing the 127th 10000-row fragment. Maybe it’s a bug?
Hi,
I managed to reproduce the issue that you’re seeing and submitted a PyMongo bug report PYTHON-2055. The MongoDB Python driver team has released a fix in PyMongo v3.10 to address the issue.
The problem was that PyMongo did not factor in the 16-byte message header when batching a compressed bulk write OP_MSG. The 127th batch of 10K rows from your input file was 9 bytes short of the OP_MSG compressed limit of 48MB, which managed to trigger this bug. If you were to reduce the size of the rows, or if the documents were smaller, you would not be able to trigger the bug.
Regards,
Wan.