Hi Forest,
Thank you so much for your help! I did end up using the data_matching branch and it worked pretty well.
I had to replace the greedyMatching function in the clustering module to get things to work. I'm not sure whether this is a bug or whether I did something incorrectly that created the problem. It looks like the greedyMatching function presumes that the first vertex of each pair in dupes comes from dataset 0 and the second comes from the other dataset. This wasn't true for me, so the clusters it returned included repeats like (1,2) and (2,1): because first-position and second-position vertices are tracked in two separate "covered" sets, a reversed pair is not recognized as already matched.
I replaced
# greedyMatching (version being replaced; code kept exactly as pasted).
# Walks the entries of `dupes` from highest to lowest score and keeps a pair
# when its first vertex has not yet been used as a first vertex and its second
# vertex has not yet been used as a second vertex.
#   dupes:     iterable of entries where entry[0] is the vertex pair and
#              entry[1] is the score used for sorting and thresholding.
#   threshold: entries with a score below this value are dropped (default 0.5).
#   returns:   list of sets, one set per accepted pair.
# NOTE(review): indentation was lost in this paste; presumably the append and
# the two update calls are nested under the `if` -- confirm against the repo.
def greedyMatching(dupes, threshold=0.5):
# Vertices already consumed in the first / second position of an accepted pair.
covered_vertex_A = set([])
covered_vertex_B = set([])
clusters = []
# Highest score first, so the best-scoring pairs are considered first.
sorted_dupes = sorted(dupes, key=lambda score: score[1], reverse=True)
# Drop every entry whose score is below the threshold.
dupes_list = [dupe for dupe in sorted_dupes if dupe[1] >= threshold]
for dupe in dupes_list:
vertices = dupe[0]
# Position-specific check: vertices[0] is only looked up in set A and
# vertices[1] only in set B, so after (1, 2) is accepted the reversed pair
# (2, 1) still passes this test.
if vertices[0] not in covered_vertex_A and vertices[1] not in covered_vertex_B:
clusters.append(set(vertices))
covered_vertex_A.update([vertices[0]])
covered_vertex_B.update([vertices[1]])
return clusters
with
def greedyMatching(dupes, threshold=0.5):
    """Greedily select non-overlapping vertex pairs, best score first.

    Entries are visited in descending score order. A pair is accepted only
    if neither of its two vertices already belongs to an accepted pair,
    regardless of which position the vertex occupies. A pair and its
    reversal, e.g. ``(1, 2)`` and ``(2, 1)``, can therefore never both be
    returned.

    Args:
        dupes: Iterable of entries where ``entry[0]`` is the vertex pair
            (indexable, with hashable elements) and ``entry[1]`` is its
            score.
        threshold: Minimum score an entry needs in order to be considered.

    Returns:
        A list of sets, one per accepted pair, in descending score order.
    """
    covered_vertex = set()
    clusters = []

    # Sorting is stable, so equally scored entries keep their input order.
    sorted_dupes = sorted(dupes, key=lambda dupe: dupe[1], reverse=True)
    candidates = (dupe for dupe in sorted_dupes if dupe[1] >= threshold)

    for dupe in candidates:
        vertices = dupe[0]
        first, second = vertices[0], vertices[1]
        if first in covered_vertex or second in covered_vertex:
            # One endpoint is already matched; skip without marking the
            # other endpoint, so it stays available for later pairs.
            continue
        clusters.append(set(vertices))
        covered_vertex.update((first, second))

    return clusters