The problem comes from PDFBox, a library used by Apache Tika, which parses fetched content for Constellio. I tried updating both Tika and PDFBox to their latest versions.
Now, the files without security (constellio-03A_Acrobat_6_pdfwriter_1_4.pdf and constellio-03A_Acrobat_6_pdfwriter_1_5.pdf) show up when I search for "ipsum".
This is not the case for the files with security: constellio-03P_Acrobat_6_pdfwriter_1_4.pdf and constellio-03P_Acrobat_6_pdfwriter_1_5.pdf.
Title is correctly extracted for the files without security.
I found the problem with the keywords; it will be fixed in the next build. They are now extracted from the files without security.
So content and metadata are extracted correctly when the files don't have security.
WARN [http-8080-4] (PDFParser.java:182) - Parsing Error, Skipping Object
java.io.IOException: Error: Expected an integer type, actual='ÂŽRª çÊq2ä)/Title(SI 'tEáb*—BÉ iïހw8ôÜÅ>ûVAÅ Úuû3Έ 1»¡'
at org.apache.pdfbox.pdfparser.BaseParser.readInt(BaseParser.java:1384)
at org.apache.pdfbox.pdfparser.PDFParser.parseObject(PDFParser.java:499)
at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:172)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:881)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:846)
at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:74)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:137)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:150)
at com.doculibre.constellio.feedprotocol.FeedProcessor.asContentParse(FeedProcessor.java:282)
at com.doculibre.constellio.feedprotocol.FeedProcessor.asRecord(FeedProcessor.java:344)
at com.doculibre.constellio.feedprotocol.FeedProcessor.addRecord(FeedProcessor.java:209)
at com.doculibre.constellio.feedprotocol.FeedProcessor.processRecord(FeedProcessor.java:124)
at com.doculibre.constellio.feedprotocol.FeedProcessor.processFeed(FeedProcessor.java:92)
at com.doculibre.constellio.feedprotocol.FeedServlet.doPost(FeedServlet.java:118)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:637)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:717)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:290)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
at com.doculibre.constellio.filters.LocalRequestFilter.doFilter(LocalRequestFilter.java:64)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:235)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
at org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:233)
at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:191)
at org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:127)
at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102)
at org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:109)
at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:298)
at org.apache.coyote.http11.Http11Processor.process(Http11Processor.java:852)
at org.apache.coyote.http11.Http11Protocol$Http11ConnectionHandler.process(Http11Protocol.java:588)
at org.apache.tomcat.util.net.JIoEndpoint$Worker.run(JIoEndpoint.java:489)
at java.lang.Thread.run(Unknown Source)
WARN [http-8080-4] (PDFParser.java:182) - Parsing Error, Skipping Object
java.io.IOException: expected='endstream' actual='' org.apache.pdfbox.io.PushBackInputStream@199d3fa
at org.apache.pdfbox.pdfparser.BaseParser.parseCOSStream(BaseParser.java:439)
at org.apache.pdfbox.pdfparser.PDFParser.parseObject(PDFParser.java:530)
at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:172)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:881)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:846)
at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:74)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:137)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:150)
at com.doculibre.constellio.feedprotocol.FeedProcessor.asContentParse(FeedProcessor.java:282)
at com.doculibre.constellio.feedprotocol.FeedProcessor.asRecord(FeedProcessor.java:344)
at com.doculibre.constellio.feedprotocol.FeedProcessor.addRecord(FeedProcessor.java:209)
at com.doculibre.constellio.feedprotocol.FeedProcessor.processRecord(FeedProcessor.java:124)
at com.doculibre.constellio.feedprotocol.FeedProcessor.processFeed(FeedProcessor.java:92)
at com.doculibre.constellio.feedprotocol.FeedServlet.doPost(FeedServlet.java:118)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:637)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:717)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:290)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
at com.doculibre.constellio.filters.LocalRequestFilter.doFilter(LocalRequestFilter.java:64)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:235)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
at org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:233)
at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:191)
at org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:127)
at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102)
at org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:109)
at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:298)
at org.apache.coyote.http11.Http11Processor.process(Http11Processor.java:852)
at org.apache.coyote.http11.Http11Protocol$Http11ConnectionHandler.process(Http11Protocol.java:588)
at org.apache.tomcat.util.net.JIoEndpoint$Worker.run(JIoEndpoint.java:489)
at java.lang.Thread.run(Unknown Source)
org.apache.tika.exception.TikaException: Unexpected RuntimeException from org.apache.tika.parser.pdf.PDFParser@18059e6
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:199)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:137)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:150)
at com.doculibre.constellio.feedprotocol.FeedProcessor.asContentParse(FeedProcessor.java:282)
at com.doculibre.constellio.feedprotocol.FeedProcessor.asRecord(FeedProcessor.java:344)
at com.doculibre.constellio.feedprotocol.FeedProcessor.addRecord(FeedProcessor.java:209)
at com.doculibre.constellio.feedprotocol.FeedProcessor.processRecord(FeedProcessor.java:124)
at com.doculibre.constellio.feedprotocol.FeedProcessor.processFeed(FeedProcessor.java:92)
at com.doculibre.constellio.feedprotocol.FeedServlet.doPost(FeedServlet.java:118)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:637)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:717)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:290)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
at com.doculibre.constellio.filters.LocalRequestFilter.doFilter(LocalRequestFilter.java:64)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:235)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
at org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:233)
at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:191)
at org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:127)
at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102)
at org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:109)
at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:298)
at org.apache.coyote.http11.Http11Processor.process(Http11Processor.java:852)
at org.apache.coyote.http11.Http11Protocol$Http11ConnectionHandler.process(Http11Protocol.java:588)
at org.apache.tomcat.util.net.JIoEndpoint$Worker.run(JIoEndpoint.java:489)
at java.lang.Thread.run(Unknown Source)
Caused by: java.lang.NullPointerException
at org.apache.pdfbox.pdmodel.PDPageNode.getCount(PDPageNode.java:109)
at org.apache.pdfbox.pdmodel.PDDocument.getNumberOfPages(PDDocument.java:946)
at org.apache.tika.parser.pdf.PDFParser.extractMetadata(PDFParser.java:108)
at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:89)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
... 26 more
Thank you for your help!
Regards,
Vincent Dussault