Hello!
It seems that the newer version of the library eliminated the problem with excessive memory consumption, but I've run into something new. I caught an 'Exception : InvalidOperation : Length mismatch in fields Batch.class_id and Batch.token,
batch.id = 3067fe9b-ff7f-0000-7905-7355f17f0000' while reading the ThetaMatrix batch by batch. The point is that I don't have a batch with that name in my batch folder.
Code for reading ThetaMatrix:
fs::recursive_directory_iterator it(options.batch_folder);
fs::recursive_directory_iterator endit;
int csvn=0;
while (it != endit)
{
if (fs::is_regular_file(*it) || it->path().extension() == ".batch")
{
std::shared_ptr<Batch> batch = artm::LoadBatch(it->path().string());
artm::GetThetaMatrixArgs args;
args.mutable_batch()->CopyFrom(*batch);
std::shared_ptr< ::artm::ThetaMatrix> theta = master_component->GetThetaMatrix(args);
csvn++;
std::string csvname = "/home/snapper/topic_modelling2/topics/theta_" +std::to_string(csvn)+ ".csv";
std::cout<< "writing to:"<< csvname << std::endl;
std::ofstream of(csvname.c_str());
artm::ThetaMatrix* theta_matrix = theta.get();
std::cout<<"items in theta:"<<theta_matrix->item_id_size() <<std::endl;
for (int i=0;i<theta_matrix->item_id_size();i++)
{
std::vector <double> probs;
for (int j=0;j<theta_matrix->item_weights(i).value_size(); j++)
{
probs.push_back(theta_matrix->item_weights(i).value(j));
}
std::vector<size_t> sorted=sort_indexes(probs);
of<<theta_matrix->item_id(i)<<",";
int num_themes=10;
for (int j=0;j<num_themes;j++)
{
if (j>0) of<<",";
of<<sorted[j];
}
of<<std::endl<<theta_matrix->item_id(i)<<",";
for (int j=0;j<num_themes;j++)
{
if (j>0) of<<",";
of<<probs[sorted[j]];
}
of<<std::endl;
}
of.close();
}
++it;
}
Log file is included.
Sincerely, Alex.