http://code.google.com/p/flaptor-util/source/detail?r=201
Modified:
/trunk
/trunk/flaptorPython
/trunk/src/com/flaptor/util/parser/HtmlParser.java
/trunk/src/com/flaptor/util/parser/IParser.java
/trunk/src/com/flaptor/util/timeplot/TimePlotUtils.java
/trunk/test/com/flaptor/util/HtmlParserTest.java
=======================================
--- /trunk/src/com/flaptor/util/parser/HtmlParser.java Thu Aug 6 10:36:48
2009
+++ /trunk/src/com/flaptor/util/parser/HtmlParser.java Mon Aug 31 11:54:51
2009
@@ -133,14 +133,55 @@
/**
* Parse the given html document.
+ *
* @param content the html document to parse.
* @return the parsed string.
*/
- public ParseOutput parse(String url, byte[] bytes, String encoding)
throws Exception {
+ public ParseOutput parse(String url, byte[] bytes) throws Exception {
// <html xmlns=...> ==> <html>
- String content = REGEXP_HTML.matcher(new
String(bytes,encoding)).replaceFirst("<html>");
+
+
// Parser keeps state, synchronize in case its used in a
multi-threaded setting.
ParseOutput out = new ParseOutput(url);
+
+ Document htmlDoc = getHtmlDocument(url, bytes);
+
+ removeNamespace((Element) htmlDoc.selectSingleNode("HTML|Html|
html"));
+
+ ignoreXpath(htmlDoc);
+
+ // NOTE(review): these two should run before ignoreXpath (which is called above),
+ // else an ignoreXpath that includes //TITLE will imply that the title is not indexed
+ // extract the links
+ extractLinks(htmlDoc, out);
+
+ // extract the title
+ extractTitle(htmlDoc, out);
+
+ replaceSeparatorTags(htmlDoc);
+
+ // extract the text from the html tags
+ extractText(htmlDoc.getRootElement(), out, ParseOutput.CONTENT);
+
+ // extract special fields
+ extractFields(htmlDoc,out);
+
+ // namespaces were already eliminated above (they break xpath); close the output
+ out.close();
+ return out;
+ }
+
+ /**
+ * Parses and fixes an html byte array using CyberNeko.
+ *
+ * @param url The base URL for relative links
+ * @param bytes The content
+ * @return a dom4j Document
+ * @throws InterruptedException
+ * @throws Exception
+ */
+ public Document getHtmlDocument(String url, byte[] bytes) throws
InterruptedException, Exception {
+ Document htmlDoc;
DOMParser parser = parsers.take();
try {
try {
@@ -152,7 +193,6 @@
throw e;
}
DOMReader reader = new DOMReader();
- Document htmlDoc;
try {
// get the doc that resulted from parsing the text
org.w3c.dom.Document document = parser.getDocument();
@@ -162,31 +202,10 @@
throw new Exception(e);
}
- // eliminate any namespace, it breaks xpath
- removeNamespace((Element) htmlDoc.selectSingleNode("HTML|Html|
html"));
-
- ignoreXpath(htmlDoc);
-
- // this 2 must be before the ignoreXPath, else an ignoreXPath
that
- // includes the //TITLE will imply that the title is not
indexed
- // extract the links
- extractLinks(htmlDoc, out);
-
- // extact the title
- extractTitle(htmlDoc, out);
-
- replaceSeparatorTags(htmlDoc);
-
- // extract the text from the html tags
- extractText(htmlDoc.getRootElement(), out,
ParseOutput.CONTENT);
-
- // extract special fields
- extractFields(htmlDoc,out);
} finally {
parsers.add(parser);
}
- out.close();
- return out;
+ return htmlDoc;
}
// Removes the namespace from the given element and its children.
@@ -359,7 +378,7 @@
String str = FileUtil.readFile(new File(arg[0]));
String url = "http://url.com";
if (arg.length > 1) { url = arg[1]; }
- ParseOutput out = parser.parse(url,
str.getBytes("UTF-8"), "UTF-8");
+ ParseOutput out = parser.parse(url, str.getBytes("UTF-8"));
System.out.println("-------------------------------------------");
System.out.println("TITLE: "+out.getTitle());
for (Pair<String,String> link : out.getLinks()) {
=======================================
--- /trunk/src/com/flaptor/util/parser/IParser.java Fri May 15 15:09:59 2009
+++ /trunk/src/com/flaptor/util/parser/IParser.java Mon Aug 31 11:54:51 2009
@@ -16,6 +16,6 @@
* @param content the html document to parse.
* @return the parsed string.
*/
- public ParseOutput parse(String url, byte[] content, String encoding)
throws Exception;
+ public ParseOutput parse(String url, byte[] content) throws Exception;
}
=======================================
--- /trunk/src/com/flaptor/util/timeplot/TimePlotUtils.java Tue Sep 23
14:43:07 2008
+++ /trunk/src/com/flaptor/util/timeplot/TimePlotUtils.java Mon Aug 31
11:54:51 2009
@@ -108,7 +108,7 @@
valuesRepresentation += decimalFormat.format(value);
}
- buffer.append(dateFormat.format(date) + valuesRepresentation
+ "\\n");
+ buffer.append(dateFormat.format(date) + valuesRepresentation +
rowSeparator);
}
return buffer.toString();
=======================================
--- /trunk/test/com/flaptor/util/HtmlParserTest.java Fri May 15 15:09:59
2009
+++ /trunk/test/com/flaptor/util/HtmlParserTest.java Mon Aug 31 11:54:51
2009
@@ -37,7 +37,7 @@
String ign= (null == ignore)? "": ignore;
String ur= (null == url)? "http://domain.com/dir/test.html": url;
HtmlParser parser = new HtmlParser(ign, new String[0]);
- ParseOutput out = parser.parse(ur, text.getBytes("UTF-8"),"UTF-8");
+ ParseOutput out = parser.parse(ur, text.getBytes("UTF-8"));
return out;
}
@@ -72,7 +72,14 @@
assertTrue("HtmlParser didn't produce expected
output", "right".equals(out.getText()));
}
-
+ @TestInfo(testType = TestInfo.TestType.UNIT)
+ public void testAcceptNoHtmlTag() throws Exception {
+ String text = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0
Strict//EN\" "+
+ "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"+
+ "<dontignore> right </dontignore> <ignorethis> wrong
</ignorethis>";
+ ParseOutput out = parse(null, text, "//IGNORETHIS");
+ assertTrue("HtmlParser didn't produce expected
output", "right".equals(out.getText()));
+ }
@TestInfo(testType = TestInfo.TestType.UNIT)
public void testTextExtraction() throws Exception {