I am putting together a script for deleting duplicate records for mongodb. Seem to be having an issue where it deletes too many records. I have a million records with 300 words in each record. Any Help would be greatly appreciated.
import pymongo
# from bson.objectid import ObjectId
# import ipdb
""" USER INPUTS """
db_name = "test'
collection = "raw_data"
""" USER INPUTS """
client = pymongo.MongoClient()
db = client[db_name]
collection = db[collection]
def query_all_obj_ids(collection=collection, field="_id"):
field_str_list = []
items = [i for i in collection.find({}, {field: True})]
for i in items:
for k, v in i.items():
field_str_list.append(v)
del items
return field_str_list
def flip_keys_values(collection=collection, field="_id", field2="content"):
master_dict = {}
items = [i for i in collection.find({}, {field: True, field2: True})]
for i in items:
for k, v in i.items():
if k == "_id":
flip_key = v
if k == "content":
flip_value = v[1000:1100].encode('ascii', errors='ignore')
master_dict[flip_value] = flip_key
return master_dict
def return_obj_id_to_remove(d, orig_obj_list):
current_set = set(orig_obj_list)
remaining_obj = []
for k, v in d.items():
remaining_obj.append(v)
token_one_lst = set(remaining_obj)
new_list = []
for i, x in enumerate(current_set):
if x not in token_one_lst:
new_list.append(x)
return new_list
def remove_record(list_of_docs, collection=collection):
collection.remove({'_id': {'$in': list_of_docs}})
obj_ids = query_all_obj_ids()
print "objects queried"
flip_dict = flip_keys_values()
print "flipped dictionary complete"
remove_list = return_obj_id_to_remove(flip_dict, obj_ids)
print "num of objectIds to remove"
print(sum(1 for x in remove_list))
del_obj_ids = remove_record(remove_list)
#print(sum(1 for rm in del_obj_ids), "records deleted")