Thanks for your answer. I have made some improvements, and I am now able to retrieve some words and concepts.
I'm using an external Java project to index the documents inside the core (solr-solrj), and I have not changed the jate.properties file.
The main changes in my schema.xml are in the two JATE field types: jate_text_2_ngrams and jate_text_2_terms.
The problem for Spanish is that there are no models for version 1.5 of OpenNLP. I found some on GitHub (for sentence splitting and POS tagging). I have also configured my own Spanish stopword list.
I have to find a way to improve one of these two filters. Has anyone changed the classes of the JATE library to work with other languages, or used another type of filter for the same purpose?
<fieldType name="jate_text_2_ngrams"
           class="solr.TextField"
           positionIncrementGap="100">
  <!-- Index-time analysis chain for shingle (n-gram) candidates.
       No query-time analyzer is declared in this field type. -->
  <analyzer type="index">
    <!-- Strip HTML markup before tokenization. -->
    <charFilter class="solr.HTMLStripCharFilterFactory"/>

    <!-- OpenNLP tokenization: Spanish sentence model combined with the
         English token model.
         NOTE(review): the language mix is presumably a workaround for a
         missing Spanish token model; confirm tokenization quality on
         Spanish text. -->
    <tokenizer class="org.apache.lucene.analysis.jate.OpenNLPTokenizerFactory"
               sentenceModel="es-sent.bin"
               tokenizerModel="en-token.bin"/>

    <!-- Fold non-ASCII characters to their ASCII equivalents.
         NOTE(review): this runs before the POS tagger and the stop word
         checks, so accented Spanish forms reach them already folded;
         verify the POS model and stopwords.txt expect that. -->
    <filter class="solr.ASCIIFoldingFilterFactory"/>

    <!-- Part-of-speech tagging with the Spanish maxent model. -->
    <filter class="org.apache.lucene.analysis.jate.OpenNLPPOSTaggerFactory"
            posTaggerClass="uk.ac.shef.dcs.jate.nlp.opennlp.POSTaggerOpenNLP"
            posTaggerModel="opennlp-es-maxent-pos-es.bin"/>

    <!-- Shingle builder configured for 2 to 5 tokens and 2 to 40 characters,
         with unigrams also emitted (outputUnigrams="true"). Leading and
         trailing stop words and symbolic tokens are removed; symbol
         characters are stripped only at the edges of a candidate. -->
    <filter class="org.apache.lucene.analysis.jate.ComplexShingleFilterFactory"
            minTokens="2"
            maxTokens="5"
            maxCharLength="40"
            minCharLength="2"
            removeLeadingStopWords="true"
            removeTrailingStopWords="true"
            removeLeadingSymbolicTokens="true"
            removeTrailingSymbolicTokens="true"
            stripAnySymbolChars="false"
            stripLeadingSymbolChars="true"
            stripTrailingSymbolChars="true"
            stopWords="stopwords.txt"
            stopWordsIgnoreCase="true"
            outputUnigrams="true"
            outputUnigramsIfNoShingles="false"
            tokenSeparator=" "/>

    <!-- Lowercase the candidates, then drop any token that matches an
         entry in stopwords.txt (case-insensitive). -->
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"/>
  </analyzer>
</fieldType>
<!-- A configuration for PoS-based candidate extraction -->
<fieldType name="jate_text_2_terms"
           class="solr.TextField"
           positionIncrementGap="100">
  <!-- Index-time analysis chain for noun-phrase candidates.
       No query-time analyzer is declared in this field type. -->
  <analyzer type="index">
    <!-- Strip HTML markup before tokenization. -->
    <charFilter class="solr.HTMLStripCharFilterFactory"/>

    <!-- OpenNLP tokenization: Spanish sentence model combined with the
         English token model.
         NOTE(review): the language mix is presumably a workaround for a
         missing Spanish token model; confirm tokenization quality on
         Spanish text. -->
    <tokenizer class="org.apache.lucene.analysis.jate.OpenNLPTokenizerFactory"
               sentenceModel="es-sent.bin"
               tokenizerModel="en-token.bin"/>

    <!-- Fold non-ASCII characters to their ASCII equivalents.
         NOTE(review): this runs before the POS tagger and the stop word
         checks, so accented Spanish forms reach them already folded;
         verify the POS model and stopwords.txt expect that. -->
    <filter class="solr.ASCIIFoldingFilterFactory"/>

    <!-- Part-of-speech tagging with the Spanish maxent model. -->
    <filter class="org.apache.lucene.analysis.jate.OpenNLPPOSTaggerFactory"
            posTaggerClass="uk.ac.shef.dcs.jate.nlp.opennlp.POSTaggerOpenNLP"
            posTaggerModel="opennlp-es-maxent-pos-es.bin"/>

    <!-- Disabled alternative, kept for reference: pattern-file chunker
         reading aclrdtec.patterns.
    <filter class="org.apache.lucene.analysis.jate.OpenNLPRegexChunkerFactory"
            patterns="aclrdtec.patterns"
            minTokens="1"
            maxTokens="5"
            maxCharLength="40"
            minCharLength="2"
            removeLeadingStopWords="true"
            removeTrailingStopWords="true"
            removeLeadingSymbolicTokens="true"
            removeTrailingSymbolicTokens="true"
            stripAnySymbolChars="false"
            stripLeadingSymbolChars="true"
            stripTrailingSymbolChars="true"
            stopWords="stopwords.txt"
            stopWordsIgnoreCase="true"/>
    -->

    <!-- Active candidate extractor: noun-phrase filter configured for
         1 to 5 tokens and 2 to 40 characters.
         NOTE(review): chunkerModel is the English en-chunker.bin while the
         POS tags come from a Spanish model; the tag sets may not line up,
         so confirm the chunker produces usable phrases for Spanish. -->
    <filter class="org.apache.lucene.analysis.jate.OpenNLPNounPhraseFilterFactory"
            chunkerModel="en-chunker.bin"
            minTokens="1"
            maxTokens="5"
            maxCharLength="40"
            minCharLength="2"
            removeLeadingStopWords="true"
            removeTrailingStopWords="true"
            removeLeadingSymbolicTokens="true"
            removeTrailingSymbolicTokens="true"
            stripAnySymbolChars="false"
            stripLeadingSymbolChars="true"
            stripTrailingSymbolChars="true"
            stopWords="stopwords.txt"
            stopWordsIgnoreCase="true"/>

    <!-- Lowercase the candidates, then drop any token that matches an
         entry in stopwords.txt (case-insensitive). -->
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt"/>
  </analyzer>
</fieldType>
<!-- ###################### JATE End #############################-->
</types>