[flaptor-util commit] r194 - Added a PDF parser and refactored the HtmlParser to make its parts more generic and reuse ...

0 views

Skip to first unread message

codesite...@google.com

unread,

May 15, 2009, 7:00:29 PM (5/15/09)

to flaptor-o...@googlegroups.com

Author: jhandl
Date: Fri May 15 15:09:59 2009
New Revision: 194

Added:
trunk/lib/FontBox-0.2.0-dev.jar (contents, props changed)
trunk/lib/PDFBox-0.7.4-dev.jar (contents, props changed)
trunk/lib/jai_core.jar (contents, props changed)
trunk/src/com/flaptor/util/parser/
trunk/src/com/flaptor/util/parser/HtmlParser.java
- copied, changed from r191,
/trunk/src/com/flaptor/util/HtmlParser.java
trunk/src/com/flaptor/util/parser/IParser.java
trunk/src/com/flaptor/util/parser/ParseOutput.java
trunk/src/com/flaptor/util/parser/PdfParser.java
Removed:
trunk/lib/nutch-0.7.2.jar
trunk/src/com/flaptor/util/HtmlParser.java
Modified:
trunk/src/com/flaptor/util/DocumentParser.java
trunk/src/com/flaptor/util/DomUtil.java
trunk/test/com/flaptor/util/HtmlParserTest.java
trunk/test/com/flaptor/util/cache/FileCacheTest.java
trunk/test/com/flaptor/util/cache/MultiCacheTest.java
trunk/test/com/flaptor/util/cache/TempFileCacheTest.java

Log:
Added a PDF parser and refactored the HtmlParser to make its parts more
generic and reuse them in the new parser.

Added: trunk/lib/FontBox-0.2.0-dev.jar
==============================================================================
Binary file. No diff available.

Added: trunk/lib/PDFBox-0.7.4-dev.jar
==============================================================================
Binary file. No diff available.

Added: trunk/lib/jai_core.jar
==============================================================================
Binary file. No diff available.

Modified: trunk/src/com/flaptor/util/DocumentParser.java
==============================================================================
--- trunk/src/com/flaptor/util/DocumentParser.java (original)
+++ trunk/src/com/flaptor/util/DocumentParser.java Fri May 15 15:09:59 2009
@@ -16,6 +16,7 @@

package com.flaptor.util;

+import java.nio.charset.Charset;
import org.apache.log4j.Logger;
import org.dom4j.Document;
import org.dom4j.DocumentException;
@@ -59,11 +60,14 @@
logger.error("genDocument: received null as input.");
throw new IllegalArgumentException();
}
+ if (logger.isDebugEnabled()) {
+ logger.debug("Will parse: "+s);
+ }
Document doc;
SAXReader reader = null;
try {
reader = readers.take();
- doc = reader.read(new org.xml.sax.InputSource(new
java.io.ByteArrayInputStream(s.getBytes())));
+ doc = reader.read(new org.xml.sax.InputSource(new
java.io.StringReader(s)));
} catch (DocumentException e) {
logger.debug("genDocument: cannot convert text to document.",
e);
return null;

Modified: trunk/src/com/flaptor/util/DomUtil.java
==============================================================================
--- trunk/src/com/flaptor/util/DomUtil.java (original)
+++ trunk/src/com/flaptor/util/DomUtil.java Fri May 15 15:09:59 2009
@@ -163,6 +163,7 @@
}

private static final boolean isLegalXml(final char c) {
+ if (c >= 0x92 && c <= 0x97) return false;
return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <=
0xd7ff)
|| (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <=
0x10ffff);
}

Copied: trunk/src/com/flaptor/util/parser/HtmlParser.java (from r191,
/trunk/src/com/flaptor/util/HtmlParser.java)
==============================================================================
--- /trunk/src/com/flaptor/util/HtmlParser.java (original)
+++ trunk/src/com/flaptor/util/parser/HtmlParser.java Fri May 15 15:09:59
2009
@@ -13,17 +13,14 @@
See the License for the specific language governing permissions and
limitations under the License.
*/
-package com.flaptor.util;
+package com.flaptor.util.parser;

import java.io.File;
-import java.net.URI;
import java.net.URISyntaxException;
-import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
-import java.util.logging.Level;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
@@ -43,15 +40,16 @@

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ArrayBlockingQueue;
+import org.dom4j.Namespace;
+import org.dom4j.QName;

/**
* This class implements a parser for html documents.
* @author Flaptor Development Team
*/
-public class HtmlParser {
+public class HtmlParser implements IParser {

private static final Logger logger =
Logger.getLogger(Execute.whoAmI());
- private static final String HTMLPARSER_CONTENT
= "HTML_PARSER_CONTENT_FIELD";
private BlockingQueue<DOMParser> parsers;
private String xpathIgnore=null;
// map field - xpath
@@ -131,181 +129,6 @@
if (null == fieldDefinitions) this.fieldDefinitions = new
HashMap<String,String>();
}

- /**
- * Replace all whitespace with one space character and trim.
- * This runs more than 4 times faster than
- * <code>str.replaceAll("\\s+"," ").trim()</code>
- */
- private String collapseWhiteSpace(String str) {
- StringBuffer buf = new StringBuffer();
- boolean inspace = false;
- for (int n=0; n<str.length(); n++) {
- char ch = str.charAt(n);
- if (ch==' ' || ch=='\t' || ch=='\n' || ch=='\f' || ch=='\r' ||
ch==10 || ch==160) { // 160 is &nbsp; in utf
- if (!inspace) {
- buf.append(' ');
- inspace = true;
- }
- } else {
- buf.append(ch);
- inspace = false;
- }
- }
- return buf.toString().trim();
- }
-
-
- // This method tries to create an URI from a possibly malformed url.
- private static URI getURI(String url) throws URISyntaxException {
- URI uri = null;
- url = url.trim();
- if (url.startsWith("file:") || url.startsWith("javascript:")) {
- logger.debug("Can't handle url: "+url);
- } else {
- int p = url.indexOf('?');
- if (p < 0) {
- try {
- uri = new URI(url.replace(" ", "%20"));
- } catch (java.net.URISyntaxException e) {
- logger.debug("Malformed URI: "+url);
- }
- } else {
- String base, query;
- int q = url.lastIndexOf('#');
- if (q < 0) q = url.length();
- if (p < q) {
- base = url.substring(0,p+1);
- query = url.substring(p+1,q);
- } else {
- base = url.substring(0,q)+"?";
- query = url.substring(p+1);
- }
- // Encode any space in the url. Can't use a url encoder because it
would encode stuff like '/' and ':'.
- base = base.replace(" ", "%20");
- try {
- // Re-encode the query part, to handle partially encoded urls.
- query =
java.net.URLEncoder.encode(java.net.URLDecoder.decode(query,"UTF-8"),"UTF-8");
- query = query.replace("%3D","=").replace("%26","&");
- } catch (java.io.UnsupportedEncodingException e) {
- logger.debug("encoding a url", e);
- }
- url = base + query;
- uri = new URI(url);
- }
- }
- return uri;
- }
-
-
- /**
- * The result of the parser is stored in an object of this class.
- * It contains the extracted text, the title and the outlinks.
- */
- public class Output {
- //private StringBuffer buffer;
- private String text;
- private List<Pair<String,String>> links;
- private String title = "";
- private String url = null;
- private URI baseUri = null;
- // map field - content
- private Map<String,StringBuffer> fields;
-
- public Output(String url) throws URISyntaxException {
- this.url = url;
- if (url.length() > 0) {
- baseUri = getURI(url);
- }
- links = new ArrayList<Pair<String,String>>();
- fields = new HashMap<String,StringBuffer>();
- fields.put(HTMLPARSER_CONTENT,new StringBuffer());
- }
-
- public void addFieldString(String field, String str) {
- // check that the str is valid.
- if (null == str || "".equals(str)) {
- logger.debug("field " + field + " is empty");
- return;
- }
-
- // So, find field.
- StringBuffer buffer = fields.get(field);
- if (null == buffer) {
- buffer = new StringBuffer();
- fields.put(field,buffer);
- }
- str = collapseWhiteSpace(str);
- if (str.length() > 0) {
- if (buffer.length() > 0) buffer.append(' ');
- buffer.append(str.trim());
- }
- }
-
- public void addString(String str) {
- addFieldString(HTMLPARSER_CONTENT,str);
- }
-
-
- public void addLink(String url, String anchor) throws
URISyntaxException {
- URI target = getURI(url);
- if (null != target) {
- if (null != baseUri) {
- if (baseUri.getPath() == null || baseUri.getPath().length()
== 0) {
- baseUri = baseUri.resolve(URI.create("/"));
- }
- target = baseUri.resolve(target);
- }
- links.add(new
Pair<String,String>(target.toString(),anchor.trim()));
- }
- }
-
- public void setTitle(String title) {
- this.title = title.trim();
- }
-
- public void setBaseUrl(String baseUrl) throws URISyntaxException {
- baseUri = getURI(baseUrl);
- }
-
- protected void close(){
- text = fields.get(HTMLPARSER_CONTENT).toString();
- text = text.replaceAll("(\\.\\s)+", ". ");
- text = text.replaceAll("\\s\\.", ". ");
- }
-
- public String getText() {
- return text;
- }
-
- public String getUrl() {
- return url;
- }
-
- public List<Pair<String,String>> getLinks() {
- return links;
- }
-
- public String getTitle() {
- return title;
- }
-
- /**
- * Gets the content of the given fieldname.
- *
- * @param fieldName
- * @return The String content of fieldName if present,
- * null otherwise.
- */
- public String getField(String fieldName) {
- StringBuffer sb = fields.get(fieldName);
- if (null != sb) {
- return sb.toString();
- } else {
- return null;
- }
- }
-
- }

/**
@@ -313,16 +136,16 @@
* @param content the html document to parse.
* @return the parsed string.
*/
- public Output parse(String url, String content) throws Exception {
+ public ParseOutput parse(String url, byte[] bytes, String encoding)
throws Exception {
// <html xmlns=...> ==> <html>
- content= REGEXP_HTML.matcher(content).replaceFirst("<html>");
+ String content = REGEXP_HTML.matcher(new
String(bytes,encoding)).replaceFirst("<html>");
// Parser keeps state, synchronize in case its used in a
multi-threaded setting.
- Output out = new Output(url);
+ ParseOutput out = new ParseOutput(url);
DOMParser parser = parsers.take();
try {
try {
// use cyberneko to parse the html documents (even broken
ones)
- org.xml.sax.InputSource inputSource = new
org.xml.sax.InputSource(new
java.io.ByteArrayInputStream(content.getBytes("UTF-8")));
+ org.xml.sax.InputSource inputSource = new
org.xml.sax.InputSource(new java.io.ByteArrayInputStream(bytes));
parser.parse(inputSource);
} catch (Exception e) {
logger.warn("Exception while trying to parse "+url);
@@ -338,6 +161,12 @@
logger.warn("Out of stack memory trying to parse "+url);
throw new Exception(e);
}
+
+ // eliminate any namespace, it breaks xpath
+ removeNamespace((Element) htmlDoc.selectSingleNode("HTML|Html|
html"));
+
+ ignoreXpath(htmlDoc);
+
// this 2 must be before the ignoreXPath, else an ignoreXPath
that
// includes the //TITLE will imply that the title is not
indexed
// extract the links
@@ -346,13 +175,10 @@
// extact the title
extractTitle(htmlDoc, out);

- ignoreXpath(htmlDoc);
-
replaceSeparatorTags(htmlDoc);

// extract the text from the html tags
- extractText(htmlDoc.getRootElement(), out,HTMLPARSER_CONTENT);
-
+ extractText(htmlDoc.getRootElement(), out,
ParseOutput.CONTENT);

// extract special fields
extractFields(htmlDoc,out);
@@ -363,17 +189,38 @@
return out;
}

+ // Removes the namespace from the given element and its children.
+ private void removeNamespace(Element elem) {
+ if (null != elem) {
+ elem.remove(elem.getNamespace());
+
elem.setQName(QName.get(elem.getName(),Namespace.NO_NAMESPACE));
+ removeNamespace(elem.content());
+ }
+ }
+
+ // Removes the namespace from the given elements and their children.
+ @SuppressWarnings("unchecked")
+ private void removeNamespace(List list) {
+ if (null != list) {
+ for (Node node : (List<Node>)list) {
+ if (node.getNodeType() == Node.ATTRIBUTE_NODE) {
+ ((Attribute)node).setNamespace(Namespace.NO_NAMESPACE);
+ } else if (node.getNodeType() == Node.ELEMENT_NODE) {
+ removeNamespace((Element)node);
+ }
+ }
+ }
+ }

-
- private void extractTitle(Document htmlDoc, Output out){
- Node titleNode = htmlDoc.selectSingleNode("//TITLE|//title|
//Title");
+ private void extractTitle(Document htmlDoc, ParseOutput out){
+ Node titleNode = htmlDoc.selectSingleNode("//TITLE|//Title|
//title");
if (null != titleNode) {
out.setTitle(titleNode.getText());
}
}

@SuppressWarnings("unchecked")
- private void extractLinks(Document htmlDoc, Output out) {
+ private void extractLinks(Document htmlDoc, ParseOutput out) {
try {
Node baseNode = htmlDoc.selectSingleNode("//BASE|//Base|
//base");
if (null != baseNode) {
@@ -409,7 +256,7 @@
}
}

- private void extractFields(Document htmlDoc, Output out) {
+ private void extractFields(Document htmlDoc, ParseOutput out) {
for (String field: fieldDefinitions.keySet()) {
String xpath = fieldDefinitions.get(field);
List elements = htmlDoc.selectNodes(xpath);
@@ -462,7 +309,7 @@
if (null == xpathIgnore){
return;
}
- List<Node> nodes = (List<Node>)
htmlDoc.selectNodes(xpathIgnore.toString());
+ List<Node> nodes = (List<Node>) htmlDoc.selectNodes(xpathIgnore);
for (Node node: nodes){
try {
node.detach();
@@ -482,7 +329,7 @@
* should be empty. After return, it contains the readable
text
* of the html and the outlinks.
*/
- protected void extractText(final Element e, final Output out, final
String fieldName) {
+ protected void extractText(final Element e, final ParseOutput out,
final String fieldName) {
//String nodeName = e.getName();
if (!(e.getNodeType() == Node.COMMENT_NODE)) {
int size = e.nodeCount();
@@ -499,7 +346,7 @@
}

public void test(String base, String link) throws Exception {
- Output out = new Output(base);
+ ParseOutput out = new ParseOutput(base);
out.addLink(link,"");
for (Pair<String,String> lnk : out.getLinks()) {
System.out.println(lnk.first());
@@ -512,7 +359,7 @@
String str = FileUtil.readFile(new File(arg[0]));
String url = "http://url.com";
if (arg.length > 1) { url = arg[1]; }
- Output out = parser.parse(url, str);
+ ParseOutput out = parser.parse(url,
str.getBytes("UTF-8"), "UTF-8");
System.out.println("-------------------------------------------");
System.out.println("TITLE: "+out.getTitle());
for (Pair<String,String> link : out.getLinks()) {

Added: trunk/src/com/flaptor/util/parser/IParser.java
==============================================================================
--- (empty file)
+++ trunk/src/com/flaptor/util/parser/IParser.java Fri May 15 15:09:59 2009
@@ -0,0 +1,21 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package com.flaptor.util.parser;
+
+/**
+ *
+ * @author jorge
+ */
+public interface IParser {
+
+ /**
+ * Parse the given html document.
+ * @param content the html document to parse.
+ * @return the parsed string.
+ */
+ public ParseOutput parse(String url, byte[] content, String encoding)
throws Exception;
+
+}

Added: trunk/src/com/flaptor/util/parser/ParseOutput.java
==============================================================================
--- (empty file)
+++ trunk/src/com/flaptor/util/parser/ParseOutput.java Fri May 15 15:09:59
2009
@@ -0,0 +1,217 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package com.flaptor.util.parser;
+
+
+import java.io.File;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.cyberneko.html.parsers.DOMParser;
+import org.dom4j.Attribute;
+import org.dom4j.Document;
+import org.dom4j.Element;
+import org.dom4j.Node;
+import org.dom4j.Text;
+import org.dom4j.dom.DOMDocumentFactory;
+import org.dom4j.io.DOMReader;
+import org.dom4j.tree.DefaultAttribute;
+
+import com.flaptor.util.Execute;
+import com.flaptor.util.FileUtil;
+import com.flaptor.util.Pair;
+
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ArrayBlockingQueue;
+
+/**
+ * The result of the parser is stored in an object of this class.
+ * It contains the extracted text, the title and the outlinks.
+ */
+public class ParseOutput {
+ private static final Logger logger =
Logger.getLogger(Execute.whoAmI());
+ private String text;
+ private List<Pair<String,String>> links;
+ private String title = "";
+ private String url = null;
+ private URI baseUri = null;
+ // map field - content
+ private Map<String,StringBuffer> fields;
+ public static final String CONTENT = "PARSER_CONTENT_FIELD";
+
+ public ParseOutput(String url) throws URISyntaxException {
+ this.url = url;
+ if (url.length() > 0) {
+ baseUri = getURI(url);
+ }
+ links = new ArrayList<Pair<String,String>>();
+ fields = new HashMap<String,StringBuffer>();
+ fields.put(CONTENT,new StringBuffer());
+ }
+
+ public void addFieldString(String field, String str) {
+ // check that the str is valid.
+ if (null == str || "".equals(str)) {
+ logger.debug("field " + field + " is empty");
+ return;
+ }
+
+ // So, find field.
+ StringBuffer buffer = fields.get(field);
+ if (null == buffer) {
+ buffer = new StringBuffer();
+ fields.put(field,buffer);
+ }
+ str = collapseWhiteSpace(str);
+ if (str.length() > 0) {
+ if (buffer.length() > 0) buffer.append(' ');
+ buffer.append(str.trim());
+ }
+ }
+
+ public void addString(String str) {
+ addFieldString(CONTENT,str);
+ }
+
+
+ public void addLink(String url, String anchor) throws
URISyntaxException {
+ URI target = getURI(url);
+ if (null != target) {
+ if (null != baseUri) {
+ if (baseUri.getPath() == null ||
baseUri.getPath().length() == 0) {
+ baseUri = baseUri.resolve(URI.create("/"));
+ }
+ target = baseUri.resolve(target);
+ }
+ links.add(new
Pair<String,String>(target.toString(),anchor.trim()));
+ }
+ }
+
+ public void setTitle(String title) {
+ if (null != title) {
+ this.title = title.trim();
+ }
+ }
+
+ public void setBaseUrl(String baseUrl) throws URISyntaxException {
+ baseUri = getURI(baseUrl);
+ }
+
+ protected void close(){
+ text = fields.get(CONTENT).toString();
+ text = text.replaceAll("(\\.\\s)+", ". ");
+ text = text.replaceAll("\\s\\.", ". ");
+ }
+
+ public String getText() {
+ return text;
+ }
+
+ public String getUrl() {
+ return url;
+ }
+
+ public List<Pair<String,String>> getLinks() {
+ return links;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ /**
+ * Gets the content of the given fieldname.
+ *
+ * @param fieldName
+ * @return The String content of fieldName if present,
+ * null otherwise.
+ */
+ public String getField(String fieldName) {
+ StringBuffer sb = fields.get(fieldName);
+ if (null != sb) {
+ return sb.toString();
+ } else {
+ return null;
+ }
+ }
+
+ // This method tries to create an URI from a possibly
malformed url.
+ private static URI getURI(String url) throws URISyntaxException {
+ URI uri = null;
+ url = url.trim();
+ if (url.startsWith("file:") || url.startsWith("javascript:")) {
+ logger.debug("Can't handle url: "+url);
+ } else {
+ int p = url.indexOf('?');
+ if (p < 0) {
+ try {
+ uri = new URI(url.replace(" ", "%20"));
+ } catch (java.net.URISyntaxException e) {
+ logger.debug("Malformed URI: "+url);
+ }
+ } else {
+ String base, query;
+ int q = url.lastIndexOf('#');
+ if (q < 0) q = url.length();
+ if (p < q) {
+ base = url.substring(0,p+1);
+ query = url.substring(p+1,q);
+ } else {
+ base = url.substring(0,q)+"?";
+ query = url.substring(p+1);
+ }
+ // Encode any space in the url. Can't use a url encoder because it
would encode stuff like '/' and ':'.
+ base = base.replace(" ", "%20");
+ try {
+ // Re-encode the query part, to handle partially encoded urls.
+ query =
java.net.URLEncoder.encode(java.net.URLDecoder.decode(query,"UTF-8"),"UTF-8");
+ query = query.replace("%3D","=").replace("%26","&");
+ } catch (java.io.UnsupportedEncodingException e) {
+ logger.debug("encoding a url", e);
+ }
+ url = base + query;
+ uri = new URI(url);
+ }
+ }
+ return uri;
+ }
+
+
+ /**
+ * Replace all whitespace with one space character and trim.
+ * This runs more than 4 times faster than
+ * <code>str.replaceAll("\\s+"," ").trim()</code>
+ */
+ private String collapseWhiteSpace(String str) {
+ StringBuffer buf = new StringBuffer();
+ boolean inspace = false;
+ for (int n=0; n<str.length(); n++) {
+ char ch = str.charAt(n);
+ if (ch==' ' || ch=='\t' || ch=='\n' || ch=='\f' || ch=='\r' ||
ch==10 || ch==160) { // 160 is &nbsp; in utf
+ if (!inspace) {
+ buf.append(' ');
+ inspace = true;
+ }
+ } else {
+ buf.append(ch);
+ inspace = false;
+ }
+ }
+ return buf.toString().trim();
+ }
+
+
+
+}
+

Added: trunk/src/com/flaptor/util/parser/PdfParser.java
==============================================================================
--- (empty file)
+++ trunk/src/com/flaptor/util/parser/PdfParser.java Fri May 15 15:09:59
2009
@@ -0,0 +1,48 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+
+package com.flaptor.util.parser;
+
+import com.flaptor.util.Execute;
+import java.io.ByteArrayInputStream;
+import org.pdfbox.encryption.DocumentEncryption;
+import org.pdfbox.pdfparser.PDFParser;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.pdmodel.PDDocumentInformation;
+import org.pdfbox.util.PDFTextStripper;
+
+
+/**
+ *
+ * @author jorge
+ */
+public class PdfParser implements IParser {
+
+
+ public ParseOutput parse(String url, byte[] content, String encoding)
throws Exception {
+ ParseOutput output = null;
+ PDDocument pdf = null;
+ try {
+ PDFParser parser = new PDFParser(new
ByteArrayInputStream(content));
+ parser.parse();
+ pdf = parser.getPDDocument();
+ PDFTextStripper stripper = new PDFTextStripper();
+ String text = stripper.getText(pdf);
+ PDDocumentInformation info = pdf.getDocumentInformation();
+ String title = info.getTitle();
+ output = new ParseOutput(url);
+ output.addFieldString(ParseOutput.CONTENT, text);
+ output.setTitle(title);
+//System.out.println("ENCODING: "+encoding);
+System.out.println("TITLE: "+title);
+System.out.println("TEXT: "+text);
+System.out.println("LEN: "+text.length());
+ } finally {
+ Execute.close(pdf);
+ }
+ return output;
+ }
+
+}

Modified: trunk/test/com/flaptor/util/HtmlParserTest.java
==============================================================================
--- trunk/test/com/flaptor/util/HtmlParserTest.java (original)
+++ trunk/test/com/flaptor/util/HtmlParserTest.java Fri May 15 15:09:59 2009
@@ -15,6 +15,7 @@
*/
package com.flaptor.util;

+import com.flaptor.util.parser.HtmlParser;
import java.util.List;

import org.apache.log4j.Logger;
@@ -23,6 +24,7 @@
import com.flaptor.util.Pair;
import com.flaptor.util.TestCase;
import com.flaptor.util.TestInfo;
+import com.flaptor.util.parser.ParseOutput;

/**
* @author Flaptor Development Team
@@ -31,11 +33,11 @@
@SuppressWarnings("unused")
private static final Logger logger =
Logger.getLogger(Execute.whoAmI());

- private HtmlParser.Output parse(String url, String text, String
ignore) throws Exception {
+ private ParseOutput parse(String url, String text, String ignore)
throws Exception {
String ign= (null == ignore)? "": ignore;
String ur= (null == url)? "http://domain.com/dir/test.html": url;
HtmlParser parser = new HtmlParser(ign, new String[0]);
- HtmlParser.Output out = parser.parse(ur, text);
+ ParseOutput out = parser.parse(ur, text.getBytes("UTF-8"),"UTF-8");
return out;
}

@@ -46,7 +48,7 @@
" <html xmlns=\"http://www.w3.org/1999/xhtml\" >"+
"<sometag attr=\"no\"> one </sometag> " +
"<anothertag> two </anothertag> </html>";
- HtmlParser.Output out = parse(null, text, null);
+ ParseOutput out = parse(null, text, null);
assertTrue("HtmlParser didn't produce expected output", "one
two".equals(out.getText()));
}

@@ -56,7 +58,7 @@
"\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"+
" <html xmlns=\"http://www.w3.org/1999/xhtml\" >"+
"<head> <title> the title </title> </head> </html>";
- HtmlParser.Output out = parse(null, text, null);
+ ParseOutput out = parse(null, text, null);
assertTrue("HtmlParser didn't extract the title", "the
title".equals(out.getTitle()));
}

@@ -66,7 +68,7 @@
"\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"+
" <html xmlns=\"http://www.w3.org/1999/xhtml\" >"+
"<dontignore> right </dontignore> <ignorethis> wrong </ignorethis>
</html>";
- HtmlParser.Output out = parse(null, text, "//IGNORETHIS");
+ ParseOutput out = parse(null, text, "//IGNORETHIS");
assertTrue("HtmlParser didn't produce expected
output", "right".equals(out.getText()));
}

@@ -75,14 +77,14 @@
@TestInfo(testType = TestInfo.TestType.UNIT)
public void testTextExtraction() throws Exception {
String text = "<html> <sometag attr=\"no\"> one </sometag>
<anothertag> two </anothertag> </html>";
- HtmlParser.Output out = parse("", text, null);
+ ParseOutput out = parse("", text, null);
assertTrue("HtmlParser didn't produce expected output", "one
two".equals(out.getText()));
}

@TestInfo(testType = TestInfo.TestType.UNIT)
public void testTitle() throws Exception {
String text = "<html> <head> <title> the title </title> </head>
</html>";
- HtmlParser.Output out = parse("", text, null);
+ ParseOutput out = parse("", text, null);
assertTrue("HtmlParser didn't extract the title", "the
title".equals(out.getTitle()));
}

@@ -97,7 +99,7 @@
" <a href=\"/dir/six.html\"> six </a> "+
" <a href=\"seven.html\"> seven </a> "+
" </body> </html>";
- HtmlParser.Output out = parse("http://domain.com/dir/test.html",
text, null);
+ ParseOutput out = parse("http://domain.com/dir/test.html", text,
null);
List<Pair<String,String>> links = out.getLinks();
assertEquals("Didn't extract the right number or links", 7,
links.size());

@@ -132,7 +134,7 @@
@TestInfo(testType = TestInfo.TestType.UNIT)
public void testIgnoreTags() throws Exception {
String text = "<html> <dontignore> right </dontignore>
<ignorethis> wrong </ignorethis> </html>";
- HtmlParser.Output out = parse("", text, "//IGNORETHIS");
+ ParseOutput out = parse("", text, "//IGNORETHIS");
assertTrue("HtmlParser didn't produce expected
output", "right".equals(out.getText()));
}

Modified: trunk/test/com/flaptor/util/cache/FileCacheTest.java
==============================================================================
--- trunk/test/com/flaptor/util/cache/FileCacheTest.java (original)
+++ trunk/test/com/flaptor/util/cache/FileCacheTest.java Fri May 15
15:09:59 2009
@@ -40,11 +40,12 @@
public class FileCacheTest extends TestCase {

- private String cacheDir =
FileUtil.createTempDir("testcache", ".tmp").getAbsolutePath();
+ private String cacheDir;
private FileCache<String> cache;
private int MAX_KEY_COUNT = 100;

- public void setUp() {
+ public void setUp() throws Exception {
+ cacheDir =
FileUtil.createTempDir("testcache", ".tmp").getAbsolutePath();
String log4jConfigPath =
com.flaptor.util.FileUtil.getFilePathFromClasspath("log4j.properties");
if (null != log4jConfigPath) {
PropertyConfigurator.configureAndWatch(log4jConfigPath);

Modified: trunk/test/com/flaptor/util/cache/MultiCacheTest.java
==============================================================================
--- trunk/test/com/flaptor/util/cache/MultiCacheTest.java (original)
+++ trunk/test/com/flaptor/util/cache/MultiCacheTest.java Fri May 15
15:09:59 2009
@@ -53,7 +53,7 @@
//This is an integration test because it uses rmi. It could use a fake
rpc...
@TestInfo(testType = TestInfo.TestType.INTEGRATION,
requiresPort = {30000, 30001, 30002, 30003, 30004})
- public void testFind() throws UnsupportedEncodingException {
+ public void testFind() throws Exception {
caches = new ArrayList<FileCache<String>>();
List<RmiServer> servers = new ArrayList<RmiServer>();
List<Pair<String, Integer>> hosts = new
ArrayList<Pair<String,Integer>>();

Modified: trunk/test/com/flaptor/util/cache/TempFileCacheTest.java
==============================================================================
--- trunk/test/com/flaptor/util/cache/TempFileCacheTest.java (original)
+++ trunk/test/com/flaptor/util/cache/TempFileCacheTest.java Fri May 15
15:09:59 2009
@@ -39,11 +39,12 @@
public class TempFileCacheTest extends TestCase {

- private String cacheDir =
FileUtil.createTempDir("testcacheTemp", ".tmp").getAbsolutePath();
+ private String cacheDir;
private FileCache<String> cache;
private int MAX_KEY_COUNT = 100;

- public void setUp() {
+ public void setUp() throws Exception {
+ cacheDir =
FileUtil.createTempDir("testcacheTemp", ".tmp").getAbsolutePath();
String log4jConfigPath =
com.flaptor.util.FileUtil.getFilePathFromClasspath("log4j.properties");
if (null != log4jConfigPath) {
PropertyConfigurator.configureAndWatch(log4jConfigPath);

Reply all

Reply to author

Forward

0 new messages