Here is the traceback:
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/econ/lib/python3.7/multiprocessing/pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "/home/ubuntu/anaconda3/envs/econ/lib/python3.7/multiprocessing/pool.py", line 47, in starmapstar
return list(itertools.starmap(args[0], args[1]))
File "analysis_gensim.py", line 318, in get_stats_for_date
tfidf_model, threshold=threshold)
File "analysis_gensim.py", line 113, in get_similar_articles
for similarities in index[unclustered_articles_vec]:
File "/home/ubuntu/anaconda3/envs/econ/lib/python3.7/site-packages/gensim/similarities/docsim.py", line 513, in __getitem__
self.close_shard() # no-op if no documents added to index since last query
File "/home/ubuntu/anaconda3/envs/econ/lib/python3.7/site-packages/gensim/similarities/docsim.py", line 436, in close_shard
shard = Shard(self.shardid2filename(shardid), index)
File "/home/ubuntu/anaconda3/envs/econ/lib/python3.7/site-packages/gensim/similarities/docsim.py", line 118, in __init__
self.index = self.get_index()
File "/home/ubuntu/anaconda3/envs/econ/lib/python3.7/site-packages/gensim/similarities/docsim.py", line 164, in get_index
self.index = self.cls.load(self.fullname(), mmap='r')
File "/home/ubuntu/anaconda3/envs/econ/lib/python3.7/site-packages/gensim/utils.py", line 426, in load
obj = unpickle(fname)
File "/home/ubuntu/anaconda3/envs/econ/lib/python3.7/site-packages/gensim/utils.py", line 1384, in unpickle
return _pickle.load(f, encoding='latin1')
_pickle.UnpicklingError: invalid load key, '\x00'.
Here is the code for the function that throws the error.
def get_similar_articles(unclustered_articles, articles_day2, dct, tfidf_model, threshold=0.3, diff_source=True):
    """
    Return indices of articles from unclustered_articles that have at least
    one article in articles_day2 with cosine similarity above threshold.

    Parameters
    ----------
    @unclustered_articles: list of Articles, which are not in any cluster
    @articles_day2: list of Articles, from next day
    @dct: A gensim Dictionary object, the bag of words model for corpus
    @tfidf_model: gensim tfidf model
    @threshold: float, cosine-similarity threshold (default 0.3)
    @diff_source: boolean, whether cluster from next day should be from same
        source or different

    Returns
    -------
    list of indices into unclustered_articles whose article is similar to at
    least one article from articles_day2 (and, when diff_source is True, from
    a different news source)
    """
    import uuid

    # BUG FIX: get_tmpfile("index_new") returns the *same* path in every
    # multiprocessing worker, so concurrent workers overwrite each other's
    # Similarity shard files; a worker then mmap-loads a half-written shard
    # and dies with "_pickle.UnpicklingError: invalid load key, '\x00'".
    # A unique prefix per call keeps each worker's shards isolated.
    index_tmpfile = get_tmpfile("index_new_" + uuid.uuid4().hex)
    similar_articles = []
    index = Similarity(index_tmpfile, tfidf_model[iter(BoWIter(dct, articles_day2))], num_features=len(dct))
    unclustered_articles_vec = tfidf_model[iter(BoWIter(dct, unclustered_articles))]

    # If we only want different-source clusters, precompute at which indices
    # same-source news occurs so those similarities can be masked out.
    indices_of_same_source = None
    if diff_source:
        indices_of_same_source = get_pos_of_same_source_news(unclustered_articles, articles_day2)

    for idx, similarities in enumerate(index[unclustered_articles_vec]):
        # Check that at least one similarity for the idx-th article exceeds
        # the threshold; if so, it clusters with some article from day 2.
        similarities = np.array(similarities)
        # Mask similarities where the articles share a source, since we only
        # want unclustered articles that cluster with next-day articles from
        # a *different* source.
        if diff_source and len(indices_of_same_source[idx]) != 0:
            indices_of_same_source_i = indices_of_same_source[idx]
            # BUG FIX: original read len(similarities >= N), which measures
            # the length of an elementwise boolean array (always truthy for
            # non-empty input) instead of comparing the lengths themselves.
            assert len(similarities) >= len(indices_of_same_source_i)
            assert len(similarities) > max(indices_of_same_source_i)
            similarities[indices_of_same_source_i] = 0
        if np.any(similarities > threshold):
            similar_articles.append(idx)
    return similar_articles