from gensim.models.doc2vec import Doc2Vec
from collections import namedtuple
import numpy as np
import random

random.seed(42)
np.random.seed(42)

sen = [('This is a sentence', 1)]
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
sentences = [TaggedDocument(d.split(), [c]) for d, c in sen]

model = Doc2Vec(alpha=0.025, vector_size=300, min_alpha=0.025, min_count=1,
                dm=0, seed=1, workers=1)
model.build_vocab(sentences)
# In gensim 3.x, train() requires total_examples and epochs to be passed explicitly;
# a single call with epochs=10 is the usual approach, but the loop mirrors the original.
for epoch in range(10):
    model.train(sentences, total_examples=model.corpus_count, epochs=1)
X = model.docvecs.doctag_syn0
import gensim

model = gensim.models.Word2Vec(sentences, min_count=1, size=300, sg=1)
model2 = gensim.models.Word2Vec(sentences, min_count=1, size=300, sg=1)
print(model.wv['grAma~m'])
print(model2.wv['grAma~m'])
My outputs:
[-1.15209408e-01 5.66907644e-01 6.62487373e-02 -1.92277774e-01
5.95047951e-01 2.65130579e-01 -2.85124362e-01 -2.33701225e-02
-3.16529423e-02 -7.85982981e-02 1.58451021e-01 3.69128048e-01
-2.09954455e-01 -5.37859797e-01 2.92117268e-01 -3.05241317e-01
-5.30922115e-01 1.52387440e-01 1.28460631e-01 -2.25819230e-01
-6.83098257e-01 5.30690663e-02 -2.63407916e-01 3.50371033e-01
-3.28744091e-02 -4.24932223e-03 2.43359104e-01 1.80333003e-01
1.61999851e-01 -1.71429545e-01 -3.94135237e-01 -5.83330929e-01
5.62174797e-01 -8.26755762e-02 -6.39443398e-01 -3.13248903e-01
1.03685297e-01 -6.63443863e-01 3.02115589e-01 -4.61086273e-01
-6.08286500e-01 -1.20102964e-01 -3.00380737e-01 -4.38583970e-01
3.37147623e-01 5.86276114e-01 3.64195019e-01 -2.20914796e-01
3.70045513e-01 -8.77974704e-02 -5.53453028e-01 -8.81938487e-02
-9.76282812e-04 -2.58361727e-01 3.93593609e-01 -4.85125750e-01
-1.80572316e-01 -5.72446048e-01 -2.98964828e-01 1.22440159e-01
-3.94044340e-01 6.97918534e-01 -1.89129874e-01 4.18169558e-01
6.44118607e-01 -5.92591837e-02 -8.40789735e-01 3.06796193e-01
2.87852496e-01 -5.14767110e-01 -3.46747220e-01 -4.80065569e-02
1.54193714e-01 4.30460662e-01 7.92562187e-01 -2.27455750e-01
1.85075074e-01 1.05546251e-01 2.72004843e-01 3.51823151e-01
6.38619304e-01 2.28565618e-01 -2.82243758e-01 -7.53763914e-01
-1.75559893e-01 -1.78303003e-01 1.33382440e-01 1.07382059e+00
1.76721781e-01 -4.91557986e-01 1.37768686e-01 7.26342155e-03
-7.22404122e-01 3.96043152e-01 -1.46641195e-01 -1.03636058e-02
-5.06546684e-02 1.14440799e-01 7.53895417e-02 1.38904855e-01
6.61876857e-01 -5.82426727e-01 -5.79077959e-01 1.83478128e-02
7.77507341e-03 -2.26008892e-01 2.21303284e-01 2.89737731e-01
4.14494038e-01 8.99366736e-02 2.16657342e-03 -3.95772666e-01
-5.44024467e-01 3.56425732e-01 2.59335399e-01 3.40655327e-01
-6.12785637e-01 -4.97648686e-01 2.43335217e-01 5.74765265e-01
8.65826964e-01 4.35946226e-01 -5.08687973e-01 9.75170016e-01
3.74363750e-01 -2.11352438e-01 9.73102748e-02 6.15456045e-01
2.04090234e-02 -5.12974083e-01 -2.51947433e-01 -5.07790685e-01
-6.34955615e-03 5.10106608e-02 -5.96850626e-02 6.24946117e-01
-8.85338262e-02 -2.23340914e-01 5.28660357e-01 2.43490919e-01
-5.61996072e-04 -1.23533502e-01 -2.77670175e-01 2.10991606e-01
-5.51608384e-01 3.72584872e-02 1.12767577e-01 -5.79519391e-01
-4.28734481e-01 1.80447549e-01 1.19643882e-01 -3.38991396e-02
7.80035704e-02 -6.57083154e-01 -1.89128429e-01 -1.65244922e-01
-4.45333064e-01 -4.24145341e-01 1.97385341e-01 -1.55233994e-01
1.26955867e-01 2.12779306e-02 -1.37888819e-01 -1.03730105e-01
8.01555514e-01 5.36681890e-01 1.60923019e-01 2.79566288e-01
-8.29707146e-01 3.14042211e-01 -4.49518025e-01 -1.06513634e-01
-8.96162912e-03 5.70015192e-01 -2.82151639e-01 -3.70363533e-01
2.64375210e-01 -2.79473603e-01 6.62168682e-01 -3.45986247e-01
4.25149500e-01 1.85147703e-01 -4.86735165e-01 1.72697544e-01
-4.94701378e-02 4.78146195e-01 -1.77817672e-01 1.91457346e-01
1.19075626e-01 -3.48942786e-01 1.32155821e-01 -7.82056153e-01
1.20587079e-02 3.73162776e-01 9.26572680e-02 -7.07513392e-01
1.20747834e-02 7.50622079e-02 2.23509341e-01 3.65100920e-01
-5.09871721e-01 -7.27363646e-01 -1.54160857e-01 1.77619383e-01
6.39735386e-02 2.56710947e-01 -1.91863716e-01 -4.27002043e-01
-2.28972837e-01 -2.27127038e-02 -5.24550319e-01 7.73877203e-01
7.47188833e-03 1.05883971e-01 2.12711245e-01 -7.14540005e-01
3.71597111e-01 -3.40568602e-01 -5.36585152e-01 -3.33957258e-03
-5.30181192e-02 -4.82758507e-02 4.56799716e-01 3.20759714e-01
-1.57734185e-01 2.34588772e-01 3.84030312e-01 2.35618919e-01
9.28799529e-03 -2.23516300e-01 -4.06400055e-01 4.99487519e-01
-9.55606624e-02 2.08300158e-01 3.37629735e-01 -1.57900840e-01
-2.22409531e-01 -3.80281746e-01 1.56717405e-01 -8.15897405e-01
-1.88742116e-01 7.44439662e-01 -7.55298853e-01 6.24962330e-01
-3.25300395e-01 -3.18593919e-01 5.10223985e-01 1.93855986e-02
-2.09270921e-02 -1.46675721e-01 -1.00254126e-01 -7.87758887e-01
1.90163165e-01 5.12366951e-01 2.37174630e-02 -3.80647570e-01
2.00150892e-01 -1.83827490e-01 9.91053954e-02 -1.39361098e-01
7.45765418e-02 -5.01441181e-01 -2.78989524e-01 -6.13046344e-03
-8.45106065e-01 4.84382272e-01 -5.92408121e-01 -3.28444242e-01
2.44598147e-02 -3.37925434e-01 2.56334931e-01 -3.80912274e-01
1.62529990e-01 -4.55515683e-01 -1.96299970e-01 1.24192707e-01
-2.53181487e-01 -1.69312134e-01 2.17786446e-01 -5.32963037e-01
-3.76187503e-01 -1.07444242e-01 -1.85742676e-01 -2.97662914e-01
-5.89023054e-01 9.71629739e-01 2.32660338e-01 -4.91115563e-02
-2.23850682e-02 -5.88014007e-01 5.91092944e-01 -6.22020662e-01
-6.88432902e-02 -8.87100548e-02 -6.42245591e-01 6.67270124e-02
-2.57222414e-01 2.72782028e-01 5.56895286e-02 5.83783425e-02]
[-3.87221277e-02 4.52339530e-01 1.09868072e-01 -1.72092661e-01
4.97765332e-01 2.50845969e-01 -3.89112175e-01 1.72745399e-02
-2.21896768e-02 6.69194311e-02 1.12170547e-01 3.87466311e-01
-1.65716574e-01 -5.36912560e-01 2.56292522e-01 -3.26979369e-01
-3.98963064e-01 3.55675630e-02 2.39766553e-01 -1.63069651e-01
-5.96925557e-01 -9.02971178e-02 -3.42735410e-01 3.21216226e-01
-6.75370991e-02 9.72677767e-03 1.61467955e-01 2.65246987e-01
1.07292525e-01 -1.00848734e-01 -3.52842540e-01 -6.22156978e-01
6.01520896e-01 1.93421589e-03 -5.96066952e-01 -3.22146893e-01
6.84326440e-02 -6.87198281e-01 3.03438634e-01 -4.55634862e-01
-4.97346938e-01 -7.05456808e-02 -2.64652401e-01 -4.09175634e-01
3.82603854e-01 6.09255612e-01 3.73759061e-01 -2.77587175e-01
4.10511136e-01 -2.27908283e-01 -5.97417057e-01 -1.04541428e-01
-1.50434589e-02 -2.96758622e-01 3.64990801e-01 -4.41378981e-01
-1.69495746e-01 -5.96067905e-01 -2.23689407e-01 1.25682905e-01
-4.03957486e-01 6.20783806e-01 -2.15837747e-01 5.45356750e-01
6.38140321e-01 2.76319273e-02 -6.69643939e-01 3.75650674e-01
2.24170029e-01 -5.01874924e-01 -3.62053126e-01 -4.92009446e-02
4.54862937e-02 4.04625684e-01 7.85717010e-01 -2.84052312e-01
2.86554575e-01 2.48921707e-01 2.64979184e-01 3.08981538e-01
4.56739753e-01 3.22332412e-01 -2.45915383e-01 -7.76285827e-01
-1.99136749e-01 -1.66493654e-01 1.49698213e-01 9.18897927e-01
1.48379043e-01 -4.94304210e-01 8.37039798e-02 6.96300194e-02
-7.25678205e-01 3.74966562e-01 -1.00565374e-01 -2.58393679e-02
-9.04325470e-02 8.43370780e-02 6.42976090e-02 8.19391832e-02
6.03102624e-01 -5.07758379e-01 -5.52212119e-01 7.76851326e-02
9.15969387e-02 -2.48513430e-01 3.12966764e-01 3.47672760e-01
4.52693254e-01 1.86156034e-02 2.61147469e-02 -4.11926091e-01
-4.63104665e-01 2.89903253e-01 3.27399701e-01 3.55564564e-01
-6.28267348e-01 -4.72160071e-01 1.61588609e-01 5.48257709e-01
8.54648471e-01 3.29813391e-01 -5.41681468e-01 9.27353919e-01
3.56294364e-01 -1.88417494e-01 1.21452965e-01 5.13853431e-01
-4.81690988e-02 -4.80037838e-01 -2.45594278e-01 -4.58171874e-01
-9.28729996e-02 2.35864576e-02 -1.52121782e-01 6.12150669e-01
-1.25474334e-01 -1.29433289e-01 5.69762886e-01 2.74020851e-01
-8.90769809e-02 -9.27427113e-02 -3.32992882e-01 2.61331826e-01
-4.90069658e-01 7.59906089e-03 8.86210054e-02 -5.79834700e-01
-4.45501417e-01 2.35578641e-01 5.06468378e-02 -5.48312068e-02
9.38242748e-02 -6.92066073e-01 -2.51835227e-01 -8.02928060e-02
-4.96311545e-01 -3.57998788e-01 1.10312410e-01 -2.00665906e-01
7.64842778e-02 -4.60533872e-02 -1.57822177e-01 -2.74655130e-02
7.03948319e-01 4.92792875e-01 1.44901484e-01 3.37659985e-01
-8.89326513e-01 2.88335711e-01 -5.38539827e-01 5.27884671e-03
-4.42628115e-02 5.77439845e-01 -4.00747776e-01 -4.03252572e-01
2.92879939e-01 -2.48646215e-01 8.25638950e-01 -3.50348234e-01
5.08177698e-01 1.46836087e-01 -5.68441272e-01 2.72928387e-01
-2.38893321e-03 5.44370294e-01 -1.62190974e-01 1.22560307e-01
1.51426062e-01 -1.56004280e-01 6.95506111e-02 -8.05210292e-01
1.00371304e-04 2.97167689e-01 -1.11516491e-02 -6.82561040e-01
9.70369354e-02 1.49384335e-01 1.78279072e-01 5.02900183e-01
-6.40465975e-01 -6.75922275e-01 -2.34506503e-01 1.44189879e-01
2.02605829e-01 2.81474143e-01 -4.33670916e-02 -5.51946878e-01
-1.12937927e-01 -1.42783612e-01 -4.62246388e-01 8.70414019e-01
-7.11515546e-02 7.52391815e-02 2.47712165e-01 -7.46838987e-01
4.22199070e-01 -3.30155522e-01 -6.06278062e-01 4.41873120e-03
-5.10728061e-02 4.01204601e-02 3.48122269e-01 2.62465447e-01
-1.40936404e-01 1.93640471e-01 4.92829084e-01 2.40207240e-01
7.65149593e-02 -1.95094898e-01 -4.52034831e-01 5.04637837e-01
-1.06608078e-01 2.46136010e-01 2.02595502e-01 -9.38938186e-02
-2.01715589e-01 -3.25428814e-01 6.11635633e-02 -9.20903444e-01
-1.94872573e-01 8.03064704e-01 -7.08780169e-01 5.57815850e-01
-3.87972891e-01 -3.41693252e-01 5.14186919e-01 7.12527893e-03
-1.35374933e-01 -1.09157883e-01 -9.30306390e-02 -8.08228254e-01
1.71786442e-01 5.75264871e-01 5.02740440e-04 -3.12858850e-01
1.30191788e-01 -1.53775558e-01 8.24414715e-02 -1.85792685e-01
6.92034960e-02 -5.40443778e-01 -2.92089522e-01 -3.79952006e-02
-9.56484616e-01 4.48524714e-01 -5.19042194e-01 -5.06796658e-01
3.34648639e-02 -4.37479317e-01 1.55028895e-01 -3.32068712e-01
2.37167120e-01 -3.72257382e-01 -2.13770315e-01 1.16787195e-01
-1.70279935e-01 -1.84090868e-01 1.77147359e-01 -4.59143609e-01
-3.30913484e-01 -8.55064169e-02 -1.64870620e-01 -2.16274247e-01
-6.11448824e-01 9.69181418e-01 1.58873335e-01 2.09571850e-02
-1.57715138e-02 -5.76565623e-01 5.47562659e-01 -5.60898781e-01
-2.98905708e-02 -9.39129386e-03 -6.73840165e-01 1.19145602e-01
-2.49688298e-01 2.53679484e-01 1.05564006e-01 1.30332708e-01]
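
Rather than eyeballing the two dumps above, a small sketch (reusing `model` and `model2` from the snippet above) makes the comparison explicit:

import numpy as np

v1 = model.wv['grAma~m']
v2 = model2.wv['grAma~m']

# exact element-wise match would print True; across separate runs it generally won't
print(np.allclose(v1, v2))
# cosine similarity is a looser, more meaningful comparison for word vectors
print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))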



So far: I have a corpus in the Telugu language, from which I want to extract word vectors. I used the `simple_preprocess` function from `gensim.utils` to tokenize the corpus line by line before feeding it into the Word2Vec model. The model is defined with the following configuration: sg=1, min_count=1, workers=1, size=300. I defined a second model in the same script with the same configuration.

On a single execution, I get identical vectors for a word across both models, but there is a significant difference between the vectors from one run and the vectors from another. Also, when testing for the most similar words of a particular token, I obtained results like those shown below. The most-similar results look consistent across runs, but the vectors for the token are not.

I am unable to determine whether the model is good enough and whether this behavior makes sense.

P.S. Thank you for your help with this.
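
Run-to-run variance like this is expected: Word2Vec training is randomized. A minimal sketch of a fully reproducible setup (assuming gensim 3.x; `PYTHONHASHSEED` matters because gensim's default `hashfxn` uses Python's per-process-randomized string `hash()`):

import os
# gensim seeds each word's starting vector from hash(word + str(seed)), and
# Python 3 randomizes str hashing per process, so PYTHONHASHSEED must be set
# in the environment *before* the interpreter starts, e.g.:
#   PYTHONHASHSEED=0 python train.py
assert os.environ.get('PYTHONHASHSEED') is not None, 'launch with PYTHONHASHSEED set'

from gensim.models import Word2Vec

sentences = [['a', 'toy', 'sentence']]  # stand-in for the tokenized Telugu corpus

# seed pins the RNG; workers=1 removes thread-scheduling nondeterminism
m1 = Word2Vec(sentences, min_count=1, size=300, sg=1, seed=1, workers=1)
m2 = Word2Vec(sentences, min_count=1, size=300, sg=1, seed=1, workers=1)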
from gensim import models

# INPUT PROCESSING
ip = open("telugu_out.txt", "r")
sentences = []
for line in ip:
    values = line.split()
    sentences.append(values)
ip.close()

# CREATE SKIP-GRAM MODELS with GENSIM
# model_one = models.Word2Vec(sentences, min_count=1, size=300, sg=1, sorted_vocab=1)
model_one = models.word2vec.Word2Vec(sentences, window=2, min_count=1, size=300, sg=1, workers=1, seed=1)
model_two = models.word2vec.Word2Vec(sentences, window=2, min_count=1, size=300, sg=1, workers=1, seed=1)
words_one = list(model_one.wv.vocab)
words_two = list(model_two.wv.vocab)

# SORT THE VOCAB AFTER A RUN -> Take note of how many words and how they appear
wp = open("wordslist.txt", "w")
for item in words_one:
    wp.write("%s\n" % item)
wp.close()

# Checking Output vectors (access vectors via .wv; indexing the model directly is deprecated)
op = open("test_output.txt", "w")
op.write("%s\n" % words_one[1])
op.write("%s\n" % model_one.wv[words_one[1]])
op.write("%s\n" % words_two[1])
op.write("%s\n" % model_two.wv[words_two[1]])
op.close()

# OUTPUT OF MOST SIMILAR WORDS FOR GIVEN WORD
l1 = model_one.wv.most_similar(positive=words_one[1])
l2 = model_two.wv.most_similar(positive=words_two[1])
for item in l1:
    print(item)
print("Demarcation")
for item in l2:
    print(item)

Run 1:
[ 0.9958523 -0.47147197 0.65097 -0.24571285 -0.6058898 0.9070413
0.3259044 0.13290858 0.44459316 0.10146513 0.06157693 0.3035734
0.14823852 0.02385972 0.16782278 0.52277607 -1.1638027 0.2979827
0.1220865 0.16933802 -0.03460348 0.24263206 0.34706536 -0.56511396
0.67618394 -0.30988303 -0.4235277 -0.09398667 -0.4435567 -0.15173244
0.6324383 -0.30855298 -0.08435892 0.02250796 0.11734334 -0.2920548
-0.2680047 0.6245682 -0.01686861 0.21044953 0.1471096 0.00934904
0.666177 -0.3262147 -0.01290315 -0.00998807 0.10178193 -0.28153947
-0.45705166 -0.4108727 -0.04170163 0.43871087 -0.814259 -0.13618213
0.40693387 -0.46396148 -0.7558191 -0.07804684 -0.43436632 -0.55409837
0.21179862 -0.5764219 0.07270633 0.6497293 -0.2868497 0.9022261
-0.4076771 -0.2579078 -0.3264665 -0.15014352 -0.87714475 -0.6537882
0.43984136 -0.7327631 0.18788919 0.26774424 0.20443107 0.10700845
0.45456603 -0.21833768 -0.17108688 0.00537629 -0.45853543 0.07400348
-0.3680166 -0.44134113 0.30072883 0.42631766 -0.43157718 0.27526414
-0.22984958 -0.35696372 -0.14902666 0.46621153 0.11914957 0.28216904
0.47083223 0.10614477 0.5076535 0.6705617 0.61957324 -0.00676489
0.64673895 0.47865516 -0.20712802 -0.47122997 -0.20498651 0.06982602
0.08606695 -0.6629306 0.1893502 -0.58849186 -0.01281098 -1.0607142
0.27090773 -0.11130068 0.37512362 -0.02278676 -0.36335462 -1.1685641
-0.5993051 0.05673279 0.29358014 -0.10465646 -0.02759874 -0.16952899
-0.83593273 -0.69758373 -0.17335027 -0.3232953 -0.15813743 0.46784985
0.4990978 0.06024883 0.04660138 0.4363617 0.02299269 0.4634255
0.32902834 -0.18921866 0.1855451 0.12634681 -0.5067104 0.12577964
0.19223744 0.04577246 0.02735131 0.71163076 -0.5278348 0.44749463
0.376945 -0.16611704 0.03827371 0.17791441 -0.3685352 -1.3959688
0.34282744 -0.038228 -0.1946164 0.03619479 -0.38001496 0.29347405
-0.52058953 -0.51150197 -0.40627834 0.03738808 0.05235135 0.27088434
0.04128829 -0.13224277 -0.5870281 0.4888212 -0.5013553 -0.33840325
-0.04667711 0.42595178 -0.08512034 0.46078688 0.06958587 0.21774518
-0.5357528 -0.55441976 0.9200193 -0.25966394 -0.07128505 0.1867822
-0.78847015 0.3911397 0.20001902 0.5168435 -0.13213162 -0.41476426
0.40880403 -0.70472556 -0.3019846 -0.73301136 -0.4841682 -0.4071299
-0.06373608 -0.04890923 0.19810455 -0.28860182 0.1262603 0.5076885
0.38994458 0.09718692 -0.11097336 0.5087467 -0.7239627 -0.29602984
0.08637185 0.09352452 -0.04647732 -0.23659179 -0.5040831 -0.42176422
-0.25792438 0.6347626 0.30564275 -0.26873258 -0.10156465 -0.27756235
0.31174725 -0.1667052 0.5050408 -0.04702962 -0.37914947 0.05371798
-0.00253894 0.72906876 -0.7353431 -0.14250167 -0.98796415 0.23861283
0.06502765 -0.01398402 -0.5377197 0.59139574 -0.11160891 -0.17679025
0.34187803 0.3842313 -0.00428242 0.09069498 0.27035782 -0.18769631
0.15596582 0.29484943 -0.53860074 -0.3625199 -1.2223799 0.04406869
0.00675854 -0.4329025 0.28991264 0.68014264 -0.31510666 -0.30929586
0.44640207 -0.0441399 0.48317346 0.02638739 -0.02305605 0.40699807
0.75289243 -0.21691522 -0.3238078 0.03806297 0.06161208 -0.39349136
-0.1450328 0.65663683 0.25388828 -0.62341154 0.19427982 -0.43555006
0.06653494 -0.14870289 -0.6779405 0.41561383 -0.4769667 -0.24106765
-0.1633696 0.04095063 0.32103616 0.2734562 -0.15015554 0.09937938
0.24794115 0.08665254 0.2168162 -0.42281997 -0.074343 0.2328754
-0.30168355 -0.24085407 -0.2957521 0.43837595 -0.04217354 0.43605956]
Run 2:
[-0.70802915 0.02178197 -0.24836464 -0.7085793 0.3307285 0.41997603
-0.40124145 -0.21172026 -0.16139644 0.40721005 0.1280731 -0.1862186
-0.33455566 0.19619231 -0.16707848 -0.13550064 0.6901927 -0.13745967
0.7240471 0.25290218 -0.06697398 -0.20271726 0.07620625 -0.74574983
0.44273898 0.23671533 -0.18539037 -0.06422722 0.60258627 -0.26088896
-0.066155 -0.5878811 0.108256 0.18473758 -0.18306291 -0.5205288
-0.35230818 -0.2544013 0.66695863 -0.7109937 0.35836303 -0.5460816
0.6391165 -0.0206544 0.33236915 -0.34127194 -0.26732343 -0.47866714
0.25147748 -0.62451476 0.15416828 -0.6398328 0.55898935 -0.5082061
-0.0023218 0.00391006 0.5080834 0.13441105 0.05689903 -0.15912692
0.86899453 -0.25499666 -0.03997678 -0.30265212 -0.4177816 -0.20656629
0.300017 0.22611795 0.00223294 0.15069063 0.27369097 0.14884958
0.4445092 0.69112104 0.01326024 0.6787312 -0.12909661 0.10073176
0.92663676 -0.09726813 -0.15453792 0.14434153 -0.09505139 0.01748092
-0.3301809 -0.6079124 0.3033795 0.7116785 -0.6909506 -0.4380965
0.35830197 -0.05206834 -0.34894907 -0.20947605 -0.19848458 0.3959289
0.95797676 0.34926414 -0.1437703 -0.04847232 0.09656153 -0.11927281
0.57536995 -0.05895678 -0.6615249 -0.48643702 -0.08489993 1.0750793
0.5762222 0.34664682 0.4364199 0.14612797 -0.01005667 0.2978207
-0.2521877 0.23086436 0.0018846 0.5032702 -0.35087502 -0.56413937
0.12209958 1.0292757 -0.60851747 0.14637563 0.44303972 0.10848483
0.25018087 0.57501405 -0.45176128 -0.70547354 0.2848835 -0.09308159
0.47957882 -0.26920068 -0.3341464 -0.338714 0.22862048 0.5406842
0.07672342 0.26874578 0.2840433 -0.59559035 0.12048131 0.56039226
-0.349341 -0.38208452 0.36655244 0.54194874 -0.3506137 -0.64728457
-0.65838987 -0.16367593 0.54141736 -0.0226023 0.00804346 0.60278076
0.3094938 0.4569762 0.19062182 -0.16526458 -0.07210082 0.06603862
-0.37457517 -0.06995157 0.2903987 -0.85802996 0.42545018 0.03769855
-0.5160426 -0.19277929 0.5093559 0.7047735 -0.3821691 0.6485271
0.33717299 0.3546024 -0.0216026 0.5849844 -0.48152915 -0.78493017
-0.31315103 -0.1663884 0.20950182 -0.05740142 -0.12197259 -0.07631516
-0.15731537 -0.2601161 -0.14830975 -0.31281745 -0.24172977 0.25233367
1.2234598 0.27193925 -0.02711146 0.47753385 0.5837248 -0.04738154
-0.00708766 0.42572117 0.32611084 0.18551737 -0.24722715 0.06850826
-0.4502812 -0.3984893 0.1274212 -0.27696043 0.13879256 -0.06158809
-0.11923014 0.55439365 -0.41655758 -0.26115298 0.46465078 -0.26910457
0.05811554 0.046214 -0.7858561 0.40739098 -0.6487383 0.6730294
0.2106631 0.29240295 0.7696176 0.975032 -0.13616319 -0.17160256
-0.19087991 0.15964532 -0.33032358 0.85688114 0.17160551 0.4981394
-0.1581461 0.7563703 -0.44698533 0.3466059 -0.10070399 0.28501463
0.32532722 0.15417464 -0.01453329 0.3825714 0.18436585 -0.04536625
0.30313858 -0.3605108 -0.54537636 -0.04197651 -0.26343754 -0.6269826
-0.5499365 -0.3669463 -0.09359495 -0.5682116 0.32889694 -0.07315192
-0.6358487 -0.48084268 0.13027188 0.03215803 -0.16152091 -0.73845464
-0.15601031 0.4761991 0.22539309 0.36586016 0.15618959 -0.19133946
-0.48279822 -0.00507662 0.02642235 -0.66330844 -0.52025175 0.4420447
0.5128313 0.13308395 -0.3759939 0.50870836 0.15718904 0.36212322
0.21187715 0.53234214 -0.2611033 -0.42228213 -0.57010734 -0.5510456
0.03843927 0.32153928 0.52629614 -0.47606683 -0.51008993 -0.6875061
0.2055244 -0.5149827 0.05493334 -0.26992393 -0.02701494 -0.54003483]
Run 1:
('మండల', 0.8946073055267334)
('పంచాయితీ', 0.8903516530990601)
('పంచాయతి', 0.8829277753829956)
('కేంద్రము', 0.8827037811279297)
('శివారు', 0.8784233927726746)
('మండలంలోని', 0.8767666220664978)
('గిద్దలూరు', 0.8746854066848755)
('మండలంలో', 0.8743047714233398)
('కడప', 0.8742556571960449)
('మండలములో', 0.873102605342865)
Run 2:
('మండల', 0.8971183896064758)
('పంచాయితీ', 0.8904078602790833)
('పంచాయతి', 0.8870753645896912)
('శివారు', 0.8835635185241699)
('కేంద్రము', 0.8828266263008118)
('మండలంలో', 0.8812516331672668)
('మండలంలోని', 0.8797062635421753)
('గిద్దలూరు', 0.8780054450035095)
('కడప', 0.8775361180305481)
('చిలకలూరిపేట', 0.8765780925750732)
...
import gensim
import nltk as nl  # presumed alias for NLTK, given the `nl.word_tokenize` call below

model = gensim.models.doc2vec.Doc2Vec(
    vector_size=10,
    window=5,
    min_count=2,
    dm=0,
    alpha=0.025,
    min_alpha=0.025,
    epochs=2500,
    workers=3
)
tag_doc = gensim.models.doc2vec.TaggedDocument
# sentences_list: the raw sentences (defined elsewhere in the original post)
sent_list_pr = [tag_doc(nl.word_tokenize(sent), [sent]) for sent in sentences_list]
model.build_vocab(sent_list_pr)
model.train(sent_list_pr, total_examples=model.corpus_count, epochs=model.epochs)
import numpy as np
import scipy.spatial  # `import scipy` alone does not load the spatial submodule
import scipy as sp

n_samples_test = 1000
res = np.zeros(n_samples_test)
for i in range(n_samples_test):
    x1 = model.infer_vector(sentences_list[1].split())
    x2 = model.infer_vector(sentences_list[2].split())
    x3 = model.infer_vector(sentences_list[3].split())  # restored: x3 was used below but never assigned in the original
    d1 = sp.spatial.distance.cosine(x1, x2)  # those are similar
    d2 = sp.spatial.distance.cosine(x1, x3)  # those are rather different
    res[i] = d1 / d2
print(min(res) / max(res))
# 0.69
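
Note that `infer_vector()` itself starts each call from a new random vector and runs only a few optimization passes (`steps=5` by default in gensim 3.x), so repeated calls on the same text legitimately differ. A minimal sketch, assuming the `model` and `sentences_list` above, of steadying it with more inference steps:

import numpy as np

words = sentences_list[1].split()
# more steps -> each inference converges further, so repeated calls agree more
v_a = model.infer_vector(words, steps=100)
v_b = model.infer_vector(words, steps=100)
cos = np.dot(v_a, v_b) / (np.linalg.norm(v_a) * np.linalg.norm(v_b))
print(cos)  # should sit much closer to 1.0 than with the default steps=5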
You're not doing actual stochastic gradient descent: with `min_alpha` set equal to the starting `alpha`, the learning rate never decays (see the sketch after these notes).
It's not clear what your evaluation is trying to test.
This is a very, very small dataset for `Doc2Vec`, both in count of examples and in size of each text.
You can see consistent results on a similarly small dataset in the `doc2vec-lee.ipynb` demo notebook bundled with gensim.
Using smaller vectors and slightly more epochs may help in such a case, but no such dataset is really going to work well with `Doc2Vec`.
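
On the first point, a minimal sketch of the usual fix (assuming gensim 3.x and the `sent_list_pr` corpus above): leave `min_alpha` at a small value so the effective learning rate decays linearly across the epochs, and make a single `train()` call rather than looping:

import gensim

model = gensim.models.doc2vec.Doc2Vec(
    vector_size=10,
    window=5,
    min_count=2,
    dm=0,
    alpha=0.025,       # starting learning rate
    min_alpha=0.0001,  # decays toward this across all epochs (the library default is small)
    epochs=2500,
    workers=3,
)
model.build_vocab(sent_list_pr)
# one call; gensim handles the alpha decay over model.epochs internally
model.train(sent_list_pr, total_examples=model.corpus_count, epochs=model.epochs)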
print('training...')
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
print('done training')

# evaluate
import numpy as np
import scipy.spatial  # `import scipy as sp` alone does not expose sp.spatial
import scipy as sp

print('evaluating stability...')
sent_1 = train_corpus[3]._asdict()['words']
sent_2 = train_corpus[33]._asdict()['words']
sent_3 = train_corpus[101]._asdict()['words']
n_samples_test = 1000
res = np.zeros(n_samples_test)
for i in range(n_samples_test):
    x1 = model.infer_vector(sent_1)
    x2 = model.infer_vector(sent_2)
    x3 = model.infer_vector(sent_3)
    d1 = sp.spatial.distance.cosine(x1, x2)
    d2 = sp.spatial.distance.cosine(x1, x3)
    res[i] = d1 / d2
# np.percentile takes values in 0-100, so a 95% interval needs [2.5, 97.5];
# [0.05, 0.95] would ask for two near-minimum percentiles
res_95KI = np.percentile(res, [2.5, 97.5])
print('lower_0.95KI/upper_0.95KI for the distance ratio between example sentences:')
print(min(res_95KI) / max(res_95KI))
Your code with a dark background color is very hard to read; plain text would be better.
Are you still using an `alpha` equal to `min_alpha`?
Your current code doesn't show your current parameters, so it's unclear what's been changed since your first post.
And your evaluation is still a bit murky, being just a single set of 3 docs.
A more common evaluation is whether the top few nearest neighbors are generally the same, in generally the same rank order. Perhaps your ratio-of-ratios score can jump around a lot, especially for docs that aren't very close to each other, even in a model that's working well for other purposes.
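
A minimal sketch of that kind of check (assuming two trained models `model_one` and `model_two` as in the code above, and gensim 3.x): compare the top-10 neighbor sets rather than raw vector values:

def neighbor_overlap(m1, m2, word, topn=10):
    """Fraction of shared top-n nearest neighbors between two models for `word`."""
    n1 = [w for w, _ in m1.wv.most_similar(word, topn=topn)]
    n2 = [w for w, _ in m2.wv.most_similar(word, topn=topn)]
    return len(set(n1) & set(n2)) / topn

word = list(model_one.wv.vocab)[1]
print(neighbor_overlap(model_one, model_two, word))  # near 1.0 = stable neighborhoods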
What version of gensim are you using?
I am on 3.5.0, btw.