These NaNs are definitely caused by out-of-vocabulary (OOV) tokens: the beginning-of-sentence and end-of-sentence markers. The Google n-grams do not contain them, but the default lm.scoreSentence() method adds them before and after the provided text via BoundedList. As a quick fix, I implemented scorePhrase(), which does not wrap the text in sentence boundaries, but I wonder what the "proper" way to handle this is?
// Previous call, kept for reference: logProb += lm.scoreSentence(words);
// Accumulate the score of the words via scorePhrase() instead of lm.scoreSentence().
logProb += scorePhrase(words,lm);
/**
 * Scores a phrase with an n-gram language model using only the words supplied by the caller.
 *
 * <p>The result is the sum of {@code lm.getLogProb(ngram)} over one n-gram per word of the
 * phrase. For the first {@code lmOrder - 1} words the n-gram is the prefix of the phrase that
 * ends at that word; for every later word it is the window of {@code lmOrder} words that ends
 * at that word.
 *
 * <p>Diagnostic output is written to {@code System.out} for the model order, the phrase
 * length, and every n-gram that is scored.
 *
 * @param sentence the words of the phrase, in order; an empty list yields a score of {@code 0}
 * @param lm the language model that supplies the order and the per-n-gram log probabilities
 * @param <T> the word type
 * @return the sum of the log probabilities of the scored n-grams
 */
public static <T> float scorePhrase(final List<T> sentence, final NgramLanguageModel<T> lm) {
    final int lmOrder = lm.getLmOrder();
    final int length = sentence.size();
    System.out.println("LM order: "+lmOrder);
    System.out.println("Phrase length: "+length);
    float sentenceScore = 0.0f;
    // Growing prefixes: words that do not yet have a full (lmOrder - 1)-word history.
    // The bound must be strictly less than the phrase length; the previous "<=" let i reach
    // sentence.size(), so subList(0, i + 1) threw IndexOutOfBoundsException for phrases
    // shorter than lmOrder - 1 words (including the empty phrase).
    for (int i = 0; i < lmOrder - 1 && i < length; ++i) {
        final List<T> ngram = sentence.subList(0, i+1);
        System.out.println("first loop i="+i+", ngram: "+ngram);
        final float scoreNgram = lm.getLogProb(ngram);
        sentenceScore += scoreNgram;
    }
    // Full-order windows: each remaining word together with its lmOrder - 1 predecessors.
    for (int i = lmOrder - 1; i < length; ++i) {
        final List<T> ngram = sentence.subList(i - lmOrder+1, i+1);
        System.out.println("second loop i="+i+", ngram: "+ngram);
        final float scoreNgram = lm.getLogProb(ngram);
        sentenceScore += scoreNgram;
    }
    return sentenceScore;
}