I want to training the dataset using SVC. But first, i have to encoding the text data into sentence vector. And the result is not size-1 array
w2v = Word2Vec.load("D:/Coding/JupyterNotebook/Notebook Kampus/TA/stage-1-feature-engineering/models/hate_speech.w2v").wv
def norm_sent_vector(sentence, w2v_model):
try:
vecs = [w2v_model[word] for word in word_tokenize(sentence)]
norm_vecs = [vec / np.linalg.norm(vec) for vec in vecs if np.linalg.norm(vec) > 0]
sent_vec = np.mean(norm_vecs, axis=0)
return sent_vec
except:
print("word not found!")
vecs = [norm_sent_vector(sentence, w2v) for sentence in data.tweet]
vecs = np.array(vecs)
vecs
Output:
array([array([ 0.00632363, -0.03356985, -0.03617402, 0.00972518, 0.0266466 ,
0.02469892, -0.00310688, 0.03239372, 0.01563407, -0.0222201 ,
-0.04705418, 0.03667994, -0.06271628, 0.03091527, -0.0284467 ,
-0.03982268, 0.04624079, -0.03298965, 0.01474145, 0.11688127,
-0.0961338 , -0.0336597 , 0.0855959 , 0.06406952, 0.03522878,
0.09955541, -0.02987629, 0.02637229, -0.04772396, -0.05716022,
-0.02559921, 0.05083385, 0.03838678, -0.07374447, 0.00276218,
0.04251185, 0.07072441, -0.03106852, 0.0277851 , -0.06000457,
0.00935645, -0.0195805 , -0.00093024, -0.08371925, 0.06028785,
0.06150095, 0.05082401, -0.00099226, -0.03021681, -0.02710788,
0.01895522, 0.04998074, -0.08652449, -0.00401581, -0.0057878 ,
-0.0390181 , 0.02547539, 0.00238687, 0.01446107, -0.01878031,
0.01314872, 0.01098322, -0.00078208, 0.05600083, 0.05176528,
-0.0502908 , -0.01713197, 0.01416416, -0.07289032, -0.01821011,
0.05095656, 0.0214082 , -0.02708665, -0.07667084, -0.07586036,
0.05231574, -0.0791928 , 0.00970304, -0.03279269, 0.06451155,
0.00367095, 0.00541698, -0.02214314, 0.06261798, -0.01770678,
0.03217777, 0.03141373, 0.02568904, 0.05533632, -0.10579625,
0.09694292, -0.07045539, -0.05598944, -0.0457988 , 0.06682058,
-0.01050855, -0.00599292, -0.04044507, -0.0350836 , -0.00566648,
-0.02463416, 0.02088067, 0.06241327, 0.00905937, 0.09502037,
-0.00282625, -0.0104612 , -0.01573066, -0.07623975, 0.03399085,
-0.02766567, -0.00087084, -0.06016396, -0.0707435 , 0.07834147,
0.01159797, -0.00509208, 0.02738192, 0.04024412, -0.0968252 ,
0.01055711, 0.0402279 , 0.03283378, 0.03807291, 0.03345452,
0.02840748, 0.02301611, -0.01852652, 0.04250708, -0.00078253,
-0.02666778, 0.04215797, 0.01010866, 0.02289993, -0.03999452,
-0.06014257, -0.11041579, 0.00035835, -0.05725493, -0.05187046,
0.06514149, -0.06358718, 0.00101701, -0.04400118, 0.11000609,
0.02135473, -0.03174777, 0.0879044 , 0.02414304, -0.07474881],
dtype=float32) ,
None,
array([-0.04065338, -0.00428748, -0.01157461, -0.03719373, 0.01805919,
-0.03912182, -0.04106779, 0.00291351, 0.05014409, 0.00334532,
-0.01353863, 0.02564359, -0.00675541, 0.01965495, -0.01940909,
0.01631385, 0.02964206, -0.02185764, -0.02775703, 0.05125 ,
0.00232606, -0.031143 , 0.05930983, 0.04964527, 0.06315636,
-0.0327155 , -0.06130639, 0.0237427 , -0.01971344, -0.06758553,
-0.02744064, 0.00376569, -0.02475217, -0.02600342, -0.0110511 ,
0.05073985, 0.06027102, 0.04363936, 0.02619999, -0.01156487,
-0.01345543, -0.01458246, -0.06445442, -0.03553389, 0.05216108,
0.01495459, -0.00397028, -0.01600757, 0.0480518 , 0.03433325,
-0.04568774, 0.00467343, -0.03582645, -0.03687607, 0.01633763,
0.04380309, 0.01136094, 0.03097592, 0.01010826, -0.06226992,
0.02969844, 0.005232 , -0.01693613, -0.00021746, 0.00290578,
-0.01977911, -0.06270547, -0.01008575, -0.05970311, -0.00782933,
0.05200034, -0.0205521 , 0.03461406, -0.04297352, 0.04185179,
-0.02615889, -0.05604076, -0.00019589, -0.05551782, -0.02537026,
0.00453913, -0.02017258, -0.06115638, 0.04619998, -0.01799767,
0.03694448, -0.03901193, -0.04005376, 0.05768251, -0.05729094,
0.03530023, -0.03273493, -0.04151433, -0.01760735, -0.0004833 ,
-0.03294881, 0.01443356, -0.02510198, -0.09584749, 0.06003863,
0.0224578 , -0.04861711, 0.00472098, -0.00647888, -0.07548124,
-0.02782706, 0.04671133, 0.02352533, -0.08836451, -0.03602425,
-0.01899292, -0.00113788, -0.0392665 , -0.03655483, -0.01547347,
-0.05243741, -0.04805057, 0.00898958, 0.01065749, -0.05843276,
0.04038758, 0.05503979, 0.02836872, -0.03724481, 0.00592487,
-0.06210911, 0.04555667, -0.0487958 , -0.03390039, -0.01222485,
-0.03655333, 0.06790128, -0.02522331, 0.00937718, -0.01771854,
0.00799133, -0.04353032, -0.05314357, -0.04000784, -0.03786252,
0.03144146, -0.01709945, 0.00413258, -0.03574648, 0.01931074,
0.0607012 , -0.07848602, -0.00597325, 0.02126054, -0.1066304 ],
dtype=float32) ,
...,
array([ 0.00168325, 0.01638637, -0.05367426, 0.08605516, -0.03748937,
-0.04924987, 0.02721536, 0.13147382, 0.03386574, 0.00416609,
0.04842668, 0.04721272, -0.04846806, 0.02423235, -0.04041237,
-0.00400642, 0.05177186, -0.00774742, -0.10373266, -0.01066063,
-0.05011854, 0.00495699, 0.00098719, 0.00976519, 0.07417786,
0.06826133, 0.0408436 , -0.01987958, -0.01320286, -0.00558918,
-0.09120162, 0.01626678, -0.05071818, -0.04381114, -0.03168993,
-0.08101221, 0.06751152, -0.03111503, -0.07077248, -0.02585161,
0.04247238, 0.02903543, -0.06709499, -0.03988874, 0.04699219,
0.01772732, -0.01254724, -0.06157243, 0.02071143, 0.05730015,
-0.08267187, 0.03293799, -0.08750468, -0.09172639, 0.05151357,
-0.03130093, -0.03937228, 0.05630299, -0.02704278, -0.1057361 ,
0.00967071, 0.05062063, -0.10159098, -0.06702511, 0.06561594,
-0.10858823, -0.02900201, -0.04686756, -0.06655417, -0.15236542,
-0.04066447, 0.04934151, 0.10957249, 0.0316916 , -0.00775349,
0.0132138 , 0.03741224, 0.01879945, -0.10446768, 0.00598823,
0.0109809 , -0.00630261, -0.09353272, 0.01345601, 0.00029592,
0.04444457, 0.04835159, -0.02964055, 0.01560507, -0.06497473,
0.06874444, 0.04330424, -0.04031433, 0.05050981, 0.0102016 ,
0.1506389 , -0.01208753, -0.05479022, 0.03915089, -0.00618634,
0.05952615, 0.07473771, -0.03127231, 0.00732643, -0.04190646,
-0.11359497, -0.01590518, -0.04081123, -0.07453164, 0.0311028 ,
0.0076575 , -0.05185301, -0.00796055, -0.09137504, -0.02216516,
-0.01880144, 0.02150154, 0.02496032, 0.01707103, -0.046459 ,
-0.02846872, 0.10576995, -0.09595971, 0.05278395, 0.0255553 ,
0.03537694, 0.02434253, 0.01373013, -0.04254844, -0.05353181,
0.02321923, 0.11073506, -0.03804 , -0.02292544, -0.05252758,
0.02068404, -0.04512409, -0.03305526, 0.00880872, -0.06236563,
0.02452703, -0.00834062, 0.07232579, -0.08096729, 0.00933873,
0.07633684, -0.09527048, -0.0258839 , -0.02106833, -0.04423825],
dtype=float32) ,
array([-0.06187889, 0.05062041, -0.02499313, 0.04687024, 0.01484974,
-0.04626938, 0.01295826, 0.04960044, -0.08953588, 0.0252972 ,
0.01052297, 0.0218578 , -0.04553835, 0.05899736, -0.10645358,
-0.01921035, -0.0031419 , -0.01791026, -0.05660039, 0.0623076 ,
-0.07407988, -0.04916784, 0.09025822, 0.00552346, 0.0451568 ,
-0.01830219, 0.00440929, -0.05965381, -0.00414539, 0.00175044,
-0.01289184, -0.02480439, -0.0385672 , -0.04285608, 0.00364256,
0.0607798 , 0.09509839, 0.04049618, 0.06243602, -0.03223979,
-0.11959995, -0.05009869, -0.04220425, -0.0521507 , -0.01379632,
0.06505813, -0.05269279, -0.03362237, 0.05759918, -0.05531896,
-0.02979755, 0.08952799, 0.02481009, 0.07085644, -0.08377787,
-0.06292289, -0.00628276, 0.01904431, 0.01013082, -0.02211456,
0.00551332, 0.0396571 , -0.06194626, -0.02322994, -0.01704296,
-0.08567559, -0.10104163, -0.08923484, -0.15808633, -0.04181664,
0.00344296, -0.05279984, -0.02163424, -0.0047822 , -0.04054481,
-0.01871159, -0.03368428, 0.03268794, 0.00933631, 0.01682878,
0.00730317, -0.05426698, 0.01407813, 0.08559947, -0.07966338,
0.05666449, 0.0407485 , -0.02651764, 0.06902798, -0.04812952,
0.04762729, -0.06253072, -0.04611988, 0.0182886 , -0.01677502,
-0.00865815, 0.07996501, -0.04250925, -0.01579251, 0.00702853,
-0.01178278, 0.04611131, 0.02575719, 0.02037268, -0.04684716,
-0.06514345, 0.0857558 , -0.08016036, -0.00539184, -0.04143929,
-0.0058834 , -0.04062827, -0.02942756, -0.00280544, -0.02552972,
-0.04107826, -0.05802528, -0.03264612, 0.00122783, -0.04317533,
0.04851945, 0.01049053, -0.02535537, 0.04659666, -0.00852434,
0.00684204, 0.02644191, -0.07806855, -0.02043919, -0.01726926,
0.03728927, 0.04636593, 0.01782571, -0.10960857, -0.00496865,
0.03074102, -0.01312088, -0.010346 , -0.07232931, -0.02151999,
0.02366854, -0.06609415, 0.02682632, 0.00790856, -0.01080798,
-0.04575358, 0.0163749 , 0.0022334 , 0.09205522, 0.00181334],
dtype=float32) ,
array([-0.11340133, -0.03065359, 0.02413534, -0.01122673, -0.0557392 ,
0.00200882, -0.05563955, 0.04004776, -0.02159484, 0.04745349,
-0.06249322, 0.00059426, -0.09487826, -0.04617008, -0.10733646,
-0.07417727, -0.00520983, -0.10714733, 0.04718151, -0.03096835,
-0.09150663, 0.06079571, 0.10395265, 0.02417246, 0.01644548,
0.03516121, -0.00594802, -0.01207925, 0.01057743, -0.00468407,
0.03017103, 0.01919374, 0.02906853, -0.01309876, -0.01260625,
0.03516143, 0.03084564, -0.07132443, -0.06633206, -0.0178873 ,
0.03749521, -0.03156273, -0.02408765, -0.09042291, 0.04765481,
-0.01248983, 0.00996245, -0.04038351, -0.00041423, 0.06531069,
-0.07466106, 0.02554415, -0.14685085, 0.02376178, -0.01238975,
0.02546646, -0.04710815, -0.05991672, 0.06200522, -0.08521954,
-0.02680038, -0.00176002, 0.0544721 , -0.02854763, 0.0711354 ,
-0.04419874, 0.02190586, 0.03206788, 0.00173228, 0.02474309,
0.07488473, 0.02814417, -0.0163897 , -0.03462338, 0.0035369 ,
0.00055839, -0.02332698, -0.07069442, -0.031261 , 0.00513294,
-0.01367585, 0.02549426, 0.04590523, 0.00642319, 0.03427031,
0.0177394 , -0.04381903, -0.05257487, 0.03319243, 0.00744779,
0.04137591, -0.07645623, -0.02033906, 0.03742943, 0.01232368,
0.08448125, -0.03447416, -0.03485459, -0.0517838 , 0.09518213,
0.06365148, -0.02258487, -0.06706581, -0.01235124, 0.03343375,
-0.08848827, 0.01676641, 0.07352041, -0.12742457, -0.02394851,
0.05628035, 0.02672487, 0.04395523, -0.02335438, -0.03335742,
-0.01781333, -0.02812773, 0.03969663, 0.03918607, 0.00988915,
0.06104726, 0.03072048, 0.01618503, -0.08566879, 0.05999809,
0.00489021, 0.01867484, -0.13804404, 0.03694502, -0.0229483 ,
0.028878 , 0.09032807, 0.04099076, -0.04974495, -0.05072959,
-0.01215555, -0.10537828, -0.03521769, -0.05605902, -0.07314663,
0.05596965, -0.03960595, 0.0215279 , -0.03002405, 0.10023694,
0.08847097, -0.02842197, -0.03350435, 0.01025076, -0.06253469],
dtype=float32) ],
dtype=object)
vecs.shape
Output:
(13169,)
pipeline = Pipeline([
('algo', SVC(max_iter=500))
])
model = RandomizedSearchCV(pipeline, rsp.svm_params, cv=4, n_iter=50, n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)
print(model.best_prams_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))
Output:
200 fits failed with the following error:
TypeError: only size-1 arrays can be converted to Python scalars
ValueError: setting an array element with a sequence.