Search text in PDF and highlight them (in java)

1,345 views
Skip to first unread message

Gary Liang

unread,
May 8, 2008, 9:36:15 PM5/8/08
to pdfne...@googlegroups.com
Hi:

I am writing a java version of "search text in PDF & highlight", based on a C# version in this group.
I do not fully understand the code: I notice that it does not pass the word to the highlight function, but
rather the bbox (which I assume is the word's bounding box).

So if I use txt.begin(currPage), it will output images, but it obviously won't highlight.
If I use txt.begin(currPage, word_bbox), it won't output anything. I think word_bbox points to nowhere, so it can't find
text to highlight.

Please look at the code below; it should give you a better idea of what I want to do.

cheers
gary


Code:
---------------------------------
package pdfsearch;

import java.io.File;

import pdftron.Common.PDFNetException;
import pdftron.PDF.Annot;
import pdftron.PDF.ColorPt;
import pdftron.PDF.ColorSpace;
import pdftron.PDF.Element;
import pdftron.PDF.ElementBuilder;
import pdftron.PDF.ElementWriter;
import pdftron.PDF.GState;
import pdftron.PDF.PDFDoc;
import pdftron.PDF.PDFDraw;
import pdftron.PDF.PDFNet;
import pdftron.PDF.Page;
import pdftron.PDF.PageIterator;
import pdftron.PDF.Rect;
import pdftron.PDF.TextExtractor;
import pdftron.SDF.Obj;

public class PDFSearch
{
    static boolean containThisWord(String serchTerm, String needleTerm)
    {

        String[] array = serchTerm.split(" ");
       
        for(String searchWord: array)
        {
            int condi = searchWord.compareToIgnoreCase(needleTerm);
            if(condi == 0)
            {
                return true;
            }
        }
       
        return false;
    }
   
    // Use PDFNet to generate appearance stream for highlight
    static Obj CreateHighlightAppearance(PDFDoc doc, Rect bbox, ColorPt
            higlight_color)
    {
        Obj stm = null;
       
        try
        {
            // Create a button appearance stream
            ElementBuilder build = new ElementBuilder();
            ElementWriter writer = new ElementWriter();
            writer.begin(doc);
   
            // Draw background
            double x1 = bbox.getX1();
            double y1 = bbox.getY1();
            double x2 = bbox.getX2();
            double y2 = bbox.getY2();
           
            Element element = build.createRect(x1- 2, y1, x2 +
                    2, y2);
            element.setPathFill(true);
            element.setPathStroke(false);
            GState gs = element.getGState();
            gs.setFillColorSpace(ColorSpace.createDeviceRGB());
            gs.setFillColor(higlight_color);
            gs.setBlendMode(GState.e_bl_multiply);
            writer.writeElement(element);
            stm = writer.end();
   
            // Set the bounding box
            stm.putRect("BBox", x1, y1, x2, y2);
            stm.putName("Subtype", "Form");
        }
        catch(PDFNetException e)
        {
            e.printStackTrace();
        }
       
        return stm;
    }
   
    // Create Highlight Annotation.
    static Annot CreateHighlightAnnot(PDFDoc doc, Rect bbox, ColorPt
            highlight_color)
    {
        Annot a = null;
       
        try
        {
            a = Annot.create(doc, Annot.e_Highlight, bbox);
            a.setColor(highlight_color);
            a.setAppearance(CreateHighlightAppearance(doc, bbox, highlight_color));
           
            double x1 = bbox.getX1();
            double y1 = bbox.getY1();
            double x2 = bbox.getX2();
            double y2 = bbox.getY2();
           
            Obj quads = doc.createIndirectArray();
           
            quads.pushBackNumber(x1);
            quads.pushBackNumber(y2);
            quads.pushBackNumber(x2);
            quads.pushBackNumber(y2);
            quads.pushBackNumber(x1);
            quads.pushBackNumber(y1);
            quads.pushBackNumber(x2);
            quads.pushBackNumber(y1);
           
            a.getSDFObj().put("QuadPoints", quads);
        }
        catch(PDFNetException e)
        {
            e.printStackTrace();
        }
       
        return a;
    }
   
   
   
    public static void main(String[] args)
    {
        // Commmand line input

       
        // Get current directory path
        File file = new File("");
        String abolutePath = file.getAbsolutePath();
       
        PDFNet.initialize();
        PDFNet.setResourcesPath(abolutePath + "/resources");
        String inputPath = abolutePath + "/input";
        String outputPath = abolutePath + "/output";
       
        try
        {
            String searchTerm = "google file system";
            PDFDoc doc = new PDFDoc(inputPath + "/input.pdf");
            doc.initSecurityHandler();
           
            ColorPt highlight_color = new ColorPt(1, 1, 0);  // Yellow
            TextExtractor txt = new TextExtractor(); // Used to extract words
            Rect word_bbox = new Rect();
           
            int dpi = 150;
            PDFDraw draw=new PDFDraw(dpi);
           
            // Go through each page
            int pageNum = 0;
            for (PageIterator itr=doc.getPageIterator(); itr.hasNext();)
            {
                Page currPage = (Page)(itr.next());
               
                // Read the page.
                txt.begin(currPage, word_bbox);
                //txt.begin(currPage);
               
                //Extract words one by one.
                for (TextExtractor.Line line = txt.getFirstLine(); line.isValid(); line=line.getNextLine())
                {
                    for (TextExtractor.Word word = line.getFirstWord(); word.isValid(); word=word.getNextWord())
                    {
                        String wordStr = word.getString();
                        if(containThisWord(searchTerm, wordStr) == true)
                        {
                            currPage.annotPushBack(CreateHighlightAnnot(doc, word_bbox,
                                    highlight_color));
                        }
                    }
                   
                    //String outname = String.Format("{0}out{1:d}.jpg", outputPath, (Object)"0"); // itr.getPageNumber() //*
                    String outname = outputPath + "/" + "out" + pageNum + ".jpg";
                    draw.export(currPage, outname, "jpg");
                }
               
                ++pageNum;
               
                //test
                if(pageNum > 2)
                {
                    break;
                }
               
            }
           
            // Complete search and highlight
            System.out.println("Search & highlight completed");
           
        }
        catch(PDFNetException e)
        {
            e.printStackTrace();
        }
       
    }
   
}










--
http://gary.liang.cs.research.googlepages.com/index.html

Gary Liang

unread,
May 9, 2008, 11:12:59 PM5/9/08
to pdfne...@googlegroups.com
Fixed the error. Don't worry about it.
--
http://gary.liang.cs.research.googlepages.com/index.html

Gary Liang

unread,
May 11, 2008, 8:02:38 AM5/11/08
to pdfne...@googlegroups.com
Hi:

I am running Linux and am able to compile my code without errors.

I include this in my java code:
System.load("/home/me/projects/pdfsearch/lib/PDFNetC/Lib/libPDFNetC.so");

I run:
java  -classpath .:/home/me/projects/pdfsearch/lib/PDFNetC/Lib/PDFNet.jar PDFSearch

I get this error:
Exception in thread "main" java.lang.UnsatisfiedLinkError: no PDFNetC in java.library.path
        at java.lang.ClassLoader.loadLibrary(ClassLoader.java:1682)
        at java.lang.Runtime.loadLibrary0(Runtime.java:823)
        at java.lang.System.loadLibrary(System.java:1030)
        at pdftron.PDF.PDFNet.<clinit>(PDFNet.java:17)
        at PDFSearch.main(PDFSearch.java:146)
--
http://gary.liang.cs.research.googlepages.com/index.html

Support

unread,
May 12, 2008, 2:31:58 PM5/12/08
to PDFTron PDFNet SDK

Are you able to run any of the Java samples on Linux? ('cd /PDFNet/
Samples', then 'sh runall_java.sh').

It is possible that Java can't find the shared library. I tried to set
LD_LIBRARY_PATH and this seems to do the trick:

export LD_LIBRARY_PATH=/PDFNet/Lib/:$LD_LIBRARY_PATH
java -classpath .:/PDFNet/Lib/PDFNet.jar MyTest

For more info, please search "LD_LIBRARY_PATH" on the Net.
In case you would like to avoid setting the "LD_LIBRARY_PATH", you may
want to copy the shared library to a standard search path for shared
libraries (e.g. '/usr/local/lib').
Reply all
Reply to author
Forward
0 new messages