[flaptor-util] r210 committed - Created HyperlinkDocumentFetcher and its dependencies

1 view

Skip to first unread message

codesite...@google.com

unread,

Jun 11, 2010, 1:16:34 PM6/11/10

to flaptor-o...@googlegroups.com

Revision: 210
Author: ignaci...@gmail.com
Date: Fri Jun 11 10:15:40 2010
Log: Created HyperlinkDocumentFetcher and its dependencies
http://code.google.com/p/flaptor-util/source/detail?r=210

Added:
/trunk/src/com/flaptor/util/parser/ParseException.java
/trunk/src/com/flaptor/util/parser/http
/trunk/src/com/flaptor/util/parser/http/FetchResult.java
/trunk/src/com/flaptor/util/parser/http/HtmlParser.java
/trunk/src/com/flaptor/util/parser/http/HttpUrlFetcher.java
/trunk/src/com/flaptor/util/parser/http/HyperLinkDocumentFetcher.java
/trunk/src/com/flaptor/util/parser/http/PdfParser.java

=======================================
--- /dev/null
+++ /trunk/src/com/flaptor/util/parser/ParseException.java Fri Jun 11
10:15:40 2010
@@ -0,0 +1,20 @@
+/**
+ *
+ */
+package com.flaptor.util.parser;
+
+/**
+ * Checked exception used by the parsers in this package tree to report that
+ * a document could not be parsed.
+ */
+public class ParseException extends Exception {
+ private static final long serialVersionUID = 1L;
+ /** Creates an exception without detail message or cause. */
+ public ParseException() {
+ super();
+ }
+ /**
+  * @param message the detail message
+  * @param cause the underlying failure
+  */
+ public ParseException(String message, Throwable cause) {
+ super(message, cause);
+ }
+ /** @param message the detail message */
+ public ParseException(String message) {
+ super(message);
+ }
+ /** @param cause the underlying failure */
+ public ParseException(Throwable cause) {
+ super(cause);
+ }
+}
=======================================
--- /dev/null
+++ /trunk/src/com/flaptor/util/parser/http/FetchResult.java Fri Jun 11
10:15:40 2010
@@ -0,0 +1,135 @@
+/**
+ *
+ */
+package com.flaptor.util.parser.http;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.log4j.Logger;
+import org.w3c.dom.Document;
+
+import com.flaptor.util.Execute;
+import com.flaptor.util.Pair;
+
+public class FetchResult {
+ private static final Logger logger =
Logger.getLogger(Execute.whoAmI());
+
+ private String cleanText;
+ private Document document;
+ private List<Pair<String,String>> links = Collections.emptyList();
+ private String title = null;
+ private String url = null;
+ private URI baseUri = null;
+
+ private String documentType;
+
+ public FetchResult(String url) throws URISyntaxException {
+ this.url = url;
+ if (url.length() > 0) {
+ baseUri = getURI(url);
+ }
+ links = new ArrayList<Pair<String,String>>();
+ }
+
+ protected void setTitle(String title) {
+ if (null != title) {
+ this.title = title.trim();
+ }
+ }
+
+ protected void setBaseUrl(URI baseUri) throws URISyntaxException {
+ this.baseUri = baseUri;
+ }
+ protected void setBaseUrl(String baseUrl) throws URISyntaxException {
+ baseUri = getURI(baseUrl);
+ }
+
+ public URI getBaseUri() {
+ return baseUri;
+ }
+
+ protected void setText(String cleanText) {
+ this.cleanText = cleanText;
+ }
+
+ public String getText() {
+ return cleanText;
+ }
+
+ public String getUrl() {
+ return url;
+ }
+
+ public List<Pair<String,String>> getLinks() {
+ return links;
+ }
+
+ protected void setLinks(List<Pair<String, String>> links) {
+ this.links = links;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public Document getDocument() {
+ return document;
+ }
+
+ protected void setDocument(Document document) {
+ this.document = document;
+ }
+
+ public String getDocumentType() {
+ return documentType;
+ }
+
+ public void setDocumentType(String documentType) {
+ this.documentType = documentType;
+ }
+
+ /**
+  * Tries to create an URI from a possibly malformed url.
+  *
+  * This used to be a verbatim copy of
+  * {@link HyperLinkDocumentFetcher#getURI(String)}; it now delegates to it so
+  * that the lenient parsing rules live in one place only.
+  *
+  * @param url the url to convert
+  * @return the URI, or null when the url can not be handled
+  * @throws URISyntaxException if the url has a query string and is still
+  *         malformed after re-encoding
+  */
+ public static URI getURI(String url) throws URISyntaxException {
+     return HyperLinkDocumentFetcher.getURI(url);
+ }
+
+}
=======================================
--- /dev/null
+++ /trunk/src/com/flaptor/util/parser/http/HtmlParser.java Fri Jun 11
10:15:40 2010
@@ -0,0 +1,275 @@
+package com.flaptor.util.parser.http;
+
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+
+import org.cyberneko.html.parsers.DOMParser;
+import org.dom4j.Attribute;
+import org.dom4j.Document;
+import org.dom4j.Element;
+import org.dom4j.Namespace;
+import org.dom4j.Node;
+import org.dom4j.QName;
+import org.dom4j.Text;
+import org.dom4j.io.DOMReader;
+import org.xml.sax.SAXNotRecognizedException;
+import org.xml.sax.SAXNotSupportedException;
+
+import com.flaptor.util.Pair;
+import com.flaptor.util.parser.ParseException;
+import com.flaptor.util.parser.ParseOutput;
+import
com.flaptor.util.parser.http.HyperLinkDocumentFetcher.DocumentParser;
+import com.google.common.collect.Lists;
+
+public class HtmlParser implements DocumentParser {
+ private static final String CYBERNEKO_DEFAULT_DEFAULT_ENCODING
= "Windows-1252";
+ private static final String CYBERNEKO_DEFAULT_ENCODING_PROPERTY
= "http://cyberneko.org/html/properties/default-encoding";
+
+ private BlockingQueue<DOMParser> parsers;
+
+ public HtmlParser() {
+ buildParsers();
+ }
+
+ /**
+  * Parses an HTML stream and stores base url, title, links and text in the
+  * given result.
+  *
+  * @param url the url the stream was fetched from (used for logging)
+  * @param inputStream the raw HTML
+  * @param contentType the media type of the response; anything other than
+  *        text/html (compared ignoring case and surrounding blanks) makes
+  *        this parser decline
+  * @param encoding default encoding to hand to the HTML parser, may be null
+  * @param currentResult the result object to fill in
+  * @return currentResult, or null if this parser is not responsible
+  * @throws ParseException wrapping any failure raised while parsing
+  */
+ @Override
+ public FetchResult parse(String url, InputStream inputStream, String contentType, String encoding, FetchResult currentResult) throws ParseException {
+     // A missing header must not crash, and media types are case-insensitive.
+     if (contentType == null || !"text/html".equalsIgnoreCase(contentType.trim())) {
+         // Not the right parser
+         return null;
+     }
+
+     try {
+         Document htmlDocument = getHtmlDocument(url, inputStream, encoding);
+
+         removeNamespace(htmlDocument.getRootElement());
+
+         // A <base href> in the page overrides the base derived from the url.
+         URI baseUri = getBaseURI(htmlDocument);
+         if (baseUri != null) {
+             currentResult.setBaseUrl(baseUri);
+         }
+         // Scripts and styles must be gone before the text is extracted.
+         removeNonPrintableTags(htmlDocument);
+         String text = extractText(htmlDocument.getRootElement());
+         List<Pair<String, String>> links = getLinks(htmlDocument, currentResult.getBaseUri());
+
+         currentResult.setTitle(getTitle(htmlDocument));
+         currentResult.setLinks(links);
+         currentResult.setText(text);
+     } catch (InterruptedException e) {
+         // Keep the interruption visible to the caller's thread handling.
+         Thread.currentThread().interrupt();
+         throw new ParseException(e);
+     } catch (Exception e) {
+         throw new ParseException(e);
+     }
+
+     return currentResult;
+ }
+
+ private void buildParsers() {
+ int processors = Runtime.getRuntime().availableProcessors();
+ parsers = new ArrayBlockingQueue<DOMParser>(processors);
+
+ for (int i = 0; i < processors; i++) {
+ DOMParser parser = new org.cyberneko.html.parsers.DOMParser();
+ parsers.add(parser);
+ }
+ }
+
+ /**
+  * Collects the anchors of the document that carry an href attribute.
+  *
+  * @param document the parsed HTML document
+  * @param baseUri base used to resolve the link targets, may be null in
+  *        which case targets are kept as they are
+  * @return pairs of (target url, trimmed anchor text); links that can not be
+  *         turned into an URI are left out
+  */
+ private List<Pair<String, String>> getLinks(Document document, URI baseUri) {
+     List<Pair<String, String>> found = Lists.newArrayList();
+     URI resolveBase = baseUri;
+
+     for (Object node : document.selectNodes("//A|//a")) {
+         Element anchor = (Element) node;
+         Attribute href = anchor.attribute("href");
+         if (href == null) {
+             continue;
+         }
+         try {
+             String anchorText = anchor.getText();
+             URI target = HyperLinkDocumentFetcher.getURI(href.getValue());
+             if (target == null) {
+                 continue;
+             }
+             if (resolveBase != null) {
+                 // A base without path gets "/" as its path before resolving.
+                 String basePath = resolveBase.getPath();
+                 if (basePath == null || basePath.length() == 0) {
+                     resolveBase = resolveBase.resolve(URI.create("/"));
+                 }
+                 target = resolveBase.resolve(target);
+             }
+             found.add(new Pair<String, String>(target.toString(), anchorText.trim()));
+         } catch (URISyntaxException e) {
+             HyperLinkDocumentFetcher.logger.debug("Exception occurred, ignoring link " + anchor.getText() + " at " + href.getValue(), e);
+         }
+     }
+
+     return found;
+ }
+
+ // Removes the namespace from the given element and its children.
+ private void removeNamespace(Element elem) {
+ if (null != elem) {
+ elem.remove(elem.getNamespace());
+
elem.setQName(QName.get(elem.getName(),Namespace.NO_NAMESPACE));
+ removeNamespace(elem.content());
+ }
+ }
+
+ // Removes the namespace from the given elements and their children.
+ @SuppressWarnings("unchecked")
+ private void removeNamespace(List list) {
+ if (null != list) {
+ for (Node node : (List<Node>)list) {
+ if (node.getNodeType() == Node.ATTRIBUTE_NODE) {
+ ((Attribute)node).setNamespace(Namespace.NO_NAMESPACE);
+ } else if (node.getNodeType() == Node.ELEMENT_NODE) {
+ removeNamespace((Element)node);
+ }
+ }
+ }
+ }
+
+ /**
+  * Returns the text of the first title element found, or null if the
+  * document has none.
+  */
+ private String getTitle(Document htmlDoc) {
+     Node titleNode = htmlDoc.selectSingleNode("//TITLE|//Title|//title");
+     return (titleNode == null) ? null : titleNode.getText();
+ }
+
+
+
+ /**
+  * Takes a parser from the pool, waiting until one is available, and
+  * configures its default encoding.
+  *
+  * @param specifiedEncoding the default encoding to set, or null to leave
+  *        the parser's current setting untouched
+  * @return a parser that must be handed back through returnParser
+  * @throws InterruptedException if interrupted while waiting for a parser
+  */
+ private DOMParser getParser(String specifiedEncoding) throws InterruptedException {
+     DOMParser parser = parsers.take();
+     if (specifiedEncoding != null) {
+         // Failing to set the encoding is not fatal: the parser is still
+         // usable, so the problem is logged (not printed to stderr).
+         try {
+             parser.setProperty(CYBERNEKO_DEFAULT_ENCODING_PROPERTY, specifiedEncoding);
+         } catch (SAXNotRecognizedException e) {
+             HyperLinkDocumentFetcher.logger.warn("Could not set default encoding " + specifiedEncoding + " on the html parser", e);
+         } catch (SAXNotSupportedException e) {
+             HyperLinkDocumentFetcher.logger.warn("Could not set default encoding " + specifiedEncoding + " on the html parser", e);
+         }
+     }
+
+     return parser;
+ }
+
+ /**
+  * Resets the parser's default encoding and puts it back into the pool.
+  *
+  * @param parser a parser previously obtained from getParser
+  */
+ private void returnParser(DOMParser parser) {
+     // A failed reset must not keep the parser out of the pool, so the
+     // problem is only logged (not printed to stderr).
+     try {
+         parser.setProperty(CYBERNEKO_DEFAULT_ENCODING_PROPERTY, CYBERNEKO_DEFAULT_DEFAULT_ENCODING);
+     } catch (SAXNotRecognizedException e) {
+         HyperLinkDocumentFetcher.logger.warn("Could not reset the default encoding of the html parser", e);
+     } catch (SAXNotSupportedException e) {
+         HyperLinkDocumentFetcher.logger.warn("Could not reset the default encoding of the html parser", e);
+     }
+
+     parsers.add(parser);
+ }
+
+ /**
+  * Parses the stream with a pooled cyberneko parser and converts the
+  * resulting W3C DOM into a dom4j document.
+  *
+  * @param url only used in log messages
+  * @param is the HTML to parse
+  * @param specifiedEncoding default encoding for the parser, may be null
+  * @return the dom4j document
+  * @throws InterruptedException if interrupted while waiting for a parser
+  * @throws Exception any parsing failure; a StackOverflowError raised while
+  *         converting the DOM is wrapped in an Exception
+  */
+ public Document getHtmlDocument(String url, InputStream is,
+ String specifiedEncoding) throws InterruptedException, Exception {
+ Document htmlDoc;
+ DOMParser parser = getParser(specifiedEncoding);
+ // From here on the parser must go back to the pool whatever happens.
+ try {
+ try {
+ // use cyberneko to parse the html documents (even broken ones)
+ org.xml.sax.InputSource inputSource = new org.xml.sax.InputSource(
+ is);
+ parser.parse(inputSource);
+ } catch (Exception e) {
+ HyperLinkDocumentFetcher.logger.warn("Exception while trying to
parse " + url);
+ throw e;
+ }
+ DOMReader reader = new DOMReader();
+ try {
+ // get the doc that resulted from parsing the text
+ org.w3c.dom.Document document = parser.getDocument();
+ htmlDoc = reader.read(document);
+ } catch (java.lang.StackOverflowError e) {
+ // Turned into a regular exception so callers can handle it.
+ HyperLinkDocumentFetcher.logger.warn("Out of stack memory trying to
parse " + url);
+ throw new Exception(e);
+ }
+ } finally {
+ returnParser(parser);
+ }
+ return htmlDoc;
+ }
+
+ /**
+  * Looks up the href of the document's base element.
+  *
+  * @param htmlDocument the parsed HTML document
+  * @return the base URI, or null if there is no base element or it has no
+  *         href value
+  * @throws URISyntaxException if the href value is not a valid URI
+  */
+ public static URI getBaseURI(Document htmlDocument) throws URISyntaxException {
+     Node baseNode = htmlDocument.selectSingleNode("//BASE|//Base|//base");
+     if (null == baseNode) {
+         return null;
+     }
+
+     // Try the attribute spellings in order; the first one present wins.
+     Element baseElement = (Element) baseNode;
+     Attribute href = null;
+     for (String attributeName : new String[] {"href", "HREF", "Href"}) {
+         href = baseElement.attribute(attributeName);
+         if (null != href) {
+             break;
+         }
+     }
+     if (null == href) {
+         return null;
+     }
+
+     String base = href.getValue();
+     return (null == base) ? null : new URI(base);
+ }
+
+ public static void removeNonPrintableTags(Document document) {
+ removeNonPrintableTags(document, document.getRootElement());
+ }
+
+ @SuppressWarnings("unchecked")
+ public static void removeNonPrintableTags(Document document, Element
element) {
+ List children = element.content();
+ for (int i = 0; i < children.size(); i++) {
+ Node node = (Node)children.get(i);
+ if (node instanceof Element) {
+ Element inner = (Element)node;
+ if (inner.getName().equalsIgnoreCase("script") ||
+ inner.getName().equalsIgnoreCase("style")) {
+ element.remove(inner);
+ i--;
+ continue;
+ }
+ removeNonPrintableTags(document, inner);
+ }
+ }
+ }
+
+ /**
+  * Returns the concatenation of all text nodes below the given element, in
+  * document order.
+  */
+ public static String extractText(final Element e) {
+     StringBuffer collected = new StringBuffer();
+     extractText(e, collected);
+     return collected.toString();
+ }
+
+ /**
+  * Appends the text nodes below the given element to the buffer, in
+  * document order. Child nodes that are neither elements nor text are
+  * skipped.
+  *
+  * @param e the element to walk
+  * @param buffer receives the text
+  */
+ public static void extractText(final Element e, StringBuffer buffer) {
+     if (e.getNodeType() == Node.COMMENT_NODE) {
+         return;
+     }
+     final int childCount = e.nodeCount();
+     for (int index = 0; index < childCount; index++) {
+         Node child = e.node(index);
+         if (child instanceof Element) {
+             extractText((Element) child, buffer);
+             continue;
+         }
+         if (child instanceof Text) {
+             buffer.append(child.getText());
+         }
+     }
+ }
+
+}
=======================================
--- /dev/null
+++ /trunk/src/com/flaptor/util/parser/http/HttpUrlFetcher.java Fri Jun 11
10:15:40 2010
@@ -0,0 +1,235 @@
+package com.flaptor.util.parser.http;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.HttpURLConnection;
+import java.net.ProtocolException;
+import java.net.URL;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import com.flaptor.util.Execute;
+import com.flaptor.util.parser.ParseException;
+
+/**
+ * This abstract class provides methods for fetching the content of a url over
+ * http but leaves unimplemented the actual parsing of the response. Subclasses
+ * should implement the {@link #parse(String, InputStream, HttpURLConnection)}
+ * method and should define the return type by fixing the type parameter T.
+ * The strategy pattern is used to define a handler for the response code of an
+ * http connection ({@link HttpResponseHandler}); implementations may choose
+ * to throw runtime exceptions in some cases, or decide whether the fetcher
+ * should proceed to invoke the
+ * {@link #parse(String, InputStream, HttpURLConnection)} method or not.
+ *
+ * Subclasses may modify the default HTTP method (GET) by overriding the
+ * {@link #getMethod()} method.
+ *
+ * The abstract class handles the http connection, and takes care of handling
+ * timeouts with specific methods.
+ *
+ * @author Santiago Perez (santip), Ignacio Perez (iperez)
+ *
+ * @param <T> the type of the parsing result
+ */
+public abstract class HttpUrlFetcher<T> {
+ public interface HttpResponseHandler {
+ /**
+ * Handles response codes for http requests
+ *
+ * @param responseCode the code to be handled
+ *
+ * @return true iff the input should be processed, if false,
+ * input will not be processed and a null value will be
+ * returned by the parser
+ */
+ public boolean handleResponse(int responseCode);
+ }
+
+ private static final String USER_AGENT_HEADER = "User-Agent";
+ private final HttpResponseHandler responseHandler;
+ private final String userAgent;
+
+ /**
+ * Create a new HttpParser with the given handler.
+ *
+ * @param responseHandler the handler for HTTP response codes.
+ * @param userAgent the User-Agent for HTTP request.
+ */
+ public HttpUrlFetcher(HttpResponseHandler responseHandler, String
userAgent) {
+ this.responseHandler = responseHandler;
+ this.userAgent = userAgent;
+ }
+
+ public HttpUrlFetcher(HttpResponseHandler responseHandler) {
+ this(responseHandler, null);
+ }
+
+ /**
+ * Parses the content of the given url.
+ *
+ * @param url the url to parse.
+ * @param timeoutMillis maximum amount of milliseconds to wait before
aborting the parsing.
+ *
+ * @return the result of the parsing; for specific implementations see the
+ * documentation of {@link #parse(String, InputStream, HttpURLConnection)}
+ * in the specific class.
+ *
+ * @throws ParseException if an exception occurred during the parsing
of the content.
+ * @throws IOException if there's a problem in the connection with the
given url
+ * @throws TimeoutException if the connection and/or parsing takes
longer than the given timeout
+ * @throws InterruptedException if the thread gets interrupted while
processing.
+ */
+ public T fetchAndParse(final String url, long timeoutMillis) throws
ParseException, IOException, TimeoutException, InterruptedException {
+ return fetchAndParse(url, timeoutMillis, null);
+ }
+
+ /**
+ * Parses the content of the given url.
+ *
+ * @param url the url to parse.
+ * @param timeoutMillis maximum amount of milliseconds to wait before
aborting the parsing.
+ * @param followRedirects override HttpURLConnection followRedirects
parameter.
+ *
+ * @return the result of the parsing; for specific implementations see the
+ * documentation of {@link #parse(String, InputStream, HttpURLConnection)}
+ * in the specific class.
+ *
+ * @throws ParseException if an exception occurred during the parsing
of the content.
+ * @throws IOException if there's a problem in the connection with the
given url
+ * @throws TimeoutException if the connection and/or parsing takes
longer than the given timeout
+ * @throws InterruptedException if the thread gets interrupted while
processing.
+ */
+ public T fetchAndParse(final String url, long timeoutMillis, final
Boolean followRedirects) throws ParseException, IOException,
TimeoutException, InterruptedException {
+ Callable<T> parsingTask = new Callable<T>() {
+ public T call() throws Exception {
+ return fecthAndParse(url, followRedirects);
+ }
+ };
+ try {
+ return Execute.executeWithTimeout(parsingTask, timeoutMillis,
TimeUnit.MILLISECONDS, "httpparse");
+ } catch (ExecutionException e) {
+ Execute.checkAndThrow(ParseException.class, e.getCause());
+ Execute.checkAndThrow(IOException.class, e.getCause());
+ Execute.checkAndThrow(RuntimeException.class, e.getCause());
+ Execute.checkAndThrow(Error.class, e.getCause());
+ throw new RuntimeException(e.getCause());
+ }
+ }
+
+ /**
+ * Parses the content of the given url.
+ *
+ * @param url the url to parse
+ *
+ * @return the result of the parsing; for specific implementations see the
+ * documentation of {@link #parse(String, InputStream, HttpURLConnection)}
+ * in the specific class.
+ *
+ * @throws ParseException if an exception occurred during the parsing
of the content.
+ * @throws IOException if there's a problem in the connection with the
given url
+ */
+ public T fecthAndParse(String url) throws ParseException, IOException {
+ return fecthAndParse(url, null);
+ }
+
+ /**
+ * Parses the content of the given url.
+ *
+ * @param url the url to parse
+ * @param followRedirects override HttpURLConnection followRedirects
parameter.
+ *
+ * @return the result of the parsing; for specific implementations see the
+ * documentation of {@link #parse(String, InputStream, HttpURLConnection)}
+ * in the specific class.
+ *
+ * @throws ParseException if an exception occurred during the parsing
of the content.
+ * @throws IOException if there's a problem in the connection with the
given url
+ */
+ public T fecthAndParse(String url, Boolean followRedirects) throws
ParseException, IOException {
+ // No explicit choice means redirects are followed.
+ if (followRedirects == null) {
+ followRedirects = true;
+ }
+ HttpURLConnection connection = (HttpURLConnection) new
URL(url).openConnection();
+ try {
+ connection.setDoInput(true);
+ prepareConnection(connection, followRedirects);
+ // Optional request body, written only when the subclass asks for it.
+ if (shouldDoOutput()) {
+ connection.setDoOutput(true);
+ OutputStream os = connection.getOutputStream();
+ try {
+ writeOutput(os);
+ } finally {
+ Execute.close(os);
+ }
+ }
+
+ // Without a handler every response is parsed; otherwise the handler decides.
+ if (responseHandler == null ||
responseHandler.handleResponse(connection.getResponseCode())) {
+ InputStream is = connection.getInputStream();
+ try {
+ return parse(url, is, connection);
+ } finally {
+ Execute.close(is);
+ }
+ } else {
+ // The handler rejected the response: nothing is parsed.
+ return null;
+ }
+ } finally {
+ connection.disconnect();
+ }
+ }
+
+
+ /**
+ * Method to be overriden.
+ *
+ * @param is the input stream to parse
+ * @param connection the {@link HttpURLConnection} to be used
+ * @return
+ * @throws ParseException
+ */
+ protected abstract T parse(String url, InputStream is,
HttpURLConnection connection) throws ParseException;
+
+ /**
+ * May be overriden to change the HTTP method.
+ *
+ * @return the HTTP method to use for establishing connections
+ */
+ protected String getMethod() {
+ return "GET";
+ }
+
+ /**
+ * May be overriden to do additional work when establishing a
connection, such
+ * as setting specific headers. Subclasses should invoke it's parent
implementation
+ * of this method.
+ *
+ * @param connection the connection being established
+ *
+ * @throws ProtocolException
+ */
+ protected void prepareConnection(HttpURLConnection connection, Boolean
followRedirects) throws ProtocolException {
+ if (followRedirects != null) {
+ connection.setInstanceFollowRedirects(followRedirects);
+ }
+ connection.setRequestMethod(getMethod());
+ if (userAgent != null) {
+ connection.addRequestProperty(USER_AGENT_HEADER, userAgent);
+ }
+ }
+
+ /**
+ * May be overriden by implementations to define an output in http
connections.
+ * Default implementation returns false.
+ *
+ * @return true iff output should be used
+ */
+ protected boolean shouldDoOutput() {
+ return false;
+ }
+
+ /**
+ * May be overriden in conjunction with {@link #shouldDoOutput()} to
write the
+ * necessary output in the http connection
+ *
+ * @param os the stream where the output should be written
+ */
+ protected void writeOutput(OutputStream os) {
+ }
+
+}
=======================================
--- /dev/null
+++ /trunk/src/com/flaptor/util/parser/http/HyperLinkDocumentFetcher.java
Fri Jun 11 10:15:40 2010
@@ -0,0 +1,141 @@
+package com.flaptor.util.parser.http;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
+
+import com.flaptor.util.Execute;
+import com.flaptor.util.parser.ParseException;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+public class HyperLinkDocumentFetcher extends HttpUrlFetcher<FetchResult> {
+ static final Logger logger = Logger.getLogger(Execute.whoAmI());
+
+ public static interface DocumentParser {
+ public FetchResult parse(String url, InputStream inputStream, String
contentType, String encoding, FetchResult currentResult) throws
ParseException;
+ }
+
+ private static final String CONTENT_TYPE_HEADER = "Content-Type";
+
+ public static class ContentType {
+ private Map<String, String> params;
+ public String type;
+
+ public ContentType(Map<String, String> params, String type) {
+ this.params = params;
+ this.type = type;
+ }
+
+ public String getParam(String key) {
+ return params.get(key);
+ }
+ }
+
+ private List<DocumentParser> responsibleParsers = Lists.newArrayList();
+ public HyperLinkDocumentFetcher(HttpResponseHandler responseHandler,
String userAgent) {
+ super(responseHandler, userAgent);
+
+ // initialize responsible parsers
+ responsibleParsers.add(new HtmlParser());
+ responsibleParsers.add(new PdfParser());
+ }
+
+ /**
+  * Hands the response to the registered parsers in order and returns the
+  * result of the first one that accepts it.
+  *
+  * @param url the fetched url; must start with http:// or https://
+  * @param is the response body
+  * @param connection the connection the Content-Type header is read from
+  * @return the filled result, or null if no parser accepted the content
+  * @throws ParseException if the responsible parser fails
+  * @throws IllegalArgumentException if the url has another protocol or is
+  *         malformed
+  */
+ @Override
+ protected FetchResult parse(String url, InputStream is, HttpURLConnection connection) throws ParseException {
+     if (!(url.startsWith("http://") || url.startsWith("https://"))) {
+         throw new IllegalArgumentException("URL must have http or https protocol: " + url);
+     }
+
+     FetchResult result;
+     try {
+         result = new FetchResult(url);
+     } catch (URISyntaxException e) {
+         throw new IllegalArgumentException("Malformed URL: " + url, e);
+     }
+
+     // getHeaderField returns null when the server sent no Content-Type;
+     // treat that as an empty type instead of failing with an NPE.
+     String contentTypeHeader = connection.getHeaderField(CONTENT_TYPE_HEADER);
+     ContentType contentType = parseContentType(contentTypeHeader == null ? "" : contentTypeHeader);
+     String charset = contentType.getParam("charset");
+
+     result.setDocumentType(contentType.type);
+
+     // Chain of responsibility: a parser returns null when it declines.
+     for (DocumentParser parser : responsibleParsers) {
+         FetchResult finalResult = parser.parse(url, is, contentType.type, charset, result);
+         if (finalResult != null) {
+             return finalResult;
+         }
+     }
+
+     return null;
+ }
+
+ /**
+  * Tries to create an URI from a possibly malformed url.
+  *
+  * Blanks are escaped as %20. When the url has a query string, the query is
+  * decoded and encoded again (keeping '=' and '&amp;') to cope with partially
+  * encoded urls, and a fragment following the query is dropped.
+  *
+  * @param url the url to convert, may be null
+  * @return the URI; null if the url is null, uses the file: or javascript:
+  *         scheme, or has no query string and is malformed
+  * @throws URISyntaxException if the url has a query string and is
+  *         malformed, including a broken %-escape inside the query
+  */
+ public static URI getURI(String url) throws URISyntaxException {
+     if (url == null) {
+         return null;
+     }
+     url = url.trim();
+     if (url.startsWith("file:") || url.startsWith("javascript:")) {
+         logger.debug("Can't handle url: "+url);
+         return null;
+     }
+
+     int p = url.indexOf('?');
+     if (p < 0) {
+         try {
+             return new URI(url.replace(" ", "%20"));
+         } catch (java.net.URISyntaxException e) {
+             logger.debug("Malformed URI: "+url);
+             return null;
+         }
+     }
+
+     String base, query;
+     int q = url.lastIndexOf('#');
+     if (q < 0) q = url.length();
+     if (p < q) {
+         base = url.substring(0,p+1);
+         query = url.substring(p+1,q);
+     } else {
+         base = url.substring(0,q)+"?";
+         query = url.substring(p+1);
+     }
+     // Encode any space in the url. Can't use a url encoder because it would encode stuff like '/' and ':'.
+     base = base.replace(" ", "%20");
+     try {
+         // Re-encode the query part, to handle partially encoded urls.
+         query = java.net.URLEncoder.encode(java.net.URLDecoder.decode(query,"UTF-8"),"UTF-8");
+         query = query.replace("%3D","=").replace("%26","&");
+     } catch (java.io.UnsupportedEncodingException e) {
+         logger.debug("encoding a url", e);
+     } catch (IllegalArgumentException e) {
+         // URLDecoder rejects broken %-escapes with an unchecked exception;
+         // report it as the checked exception callers already handle.
+         URISyntaxException malformed = new URISyntaxException(url, "Malformed escape sequence in query");
+         malformed.initCause(e);
+         throw malformed;
+     }
+     return new URI(base + query);
+ }
+
+ /**
+  * Splits a Content-Type header value into its media type and parameters.
+  *
+  * The type and the parameter names are trimmed and lower-cased, so that
+  * "Text/HTML; Charset=utf-8" yields type "text/html" and a "charset"
+  * parameter. Parameter values are trimmed and stripped of surrounding
+  * double quotes. Parameters without '=' are ignored.
+  *
+  * @param contentTypeString the header value, may be null
+  * @return the parsed content type; its type is "" when the header is null
+  *         or has no type
+  */
+ private static ContentType parseContentType(String contentTypeString) {
+     Map<String, String> params = Maps.newHashMap();
+     if (contentTypeString == null) {
+         return new ContentType(params, "");
+     }
+
+     String[] parts = contentTypeString.split(";");
+     // split() yields an empty array for input made only of ';'
+     String type = parts.length > 0 ? parts[0].trim().toLowerCase(java.util.Locale.ENGLISH) : "";
+     for (int i = 1; i < parts.length; i++) {
+         // Split on the first '=' only, the value may contain more of them.
+         int eq = parts[i].indexOf('=');
+         if (eq <= 0) {
+             continue;
+         }
+         String key = parts[i].substring(0, eq).trim().toLowerCase(java.util.Locale.ENGLISH);
+         String value = parts[i].substring(eq + 1).trim();
+         if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
+             value = value.substring(1, value.length() - 1);
+         }
+         if (key.length() > 0 && value.length() > 0) {
+             params.put(key, value);
+         }
+     }
+     return new ContentType(params, type);
+ }
+
+ public static void main(String[] args) throws ParseException, IOException
{
+ HyperLinkDocumentFetcher fetcher = new
HyperLinkDocumentFetcher(null, "me");
+ FetchResult result = fetcher.fecthAndParse("http://redpoint.com");
+
+ System.out.println("A:" + result.getLinks());
+ }
+}
=======================================
--- /dev/null
+++ /trunk/src/com/flaptor/util/parser/http/PdfParser.java Fri Jun 11
10:15:40 2010
@@ -0,0 +1,44 @@
+/**
+ *
+ */
+package com.flaptor.util.parser.http;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.util.PDFTextStripper;
+
+import com.flaptor.util.Execute;
+import com.flaptor.util.Pair;
+import com.flaptor.util.parser.ParseException;
+import
com.flaptor.util.parser.http.HyperLinkDocumentFetcher.DocumentParser;
+
+/**
+ * {@link DocumentParser} that extracts the title and the text of PDF
+ * documents.
+ */
+public class PdfParser implements DocumentParser {
+    private static final String PDF_CONTENT_TYPE = "application/pdf";
+
+    /**
+     * Extracts title and text from a PDF response.
+     *
+     * @param url the url the stream was fetched from (unused)
+     * @param inputStream the raw PDF
+     * @param contentType the media type of the response; anything other than
+     *        application/pdf (compared ignoring case and surrounding blanks)
+     *        makes this parser decline
+     * @param encoding unused
+     * @param currentResult the result object to fill in
+     * @return currentResult, or null if this parser is not responsible
+     * @throws ParseException if the PDF can not be read
+     */
+    @Override
+    public FetchResult parse(String url, InputStream inputStream, String contentType, String encoding, FetchResult currentResult) throws ParseException {
+        // Decline foreign content the same way HtmlParser does, instead of
+        // feeding arbitrary bytes to the PDF library.
+        if (contentType == null || !PDF_CONTENT_TYPE.equalsIgnoreCase(contentType.trim())) {
+            // Not the right parser
+            return null;
+        }
+
+        Pair<String, String> textFromPDF;
+        try {
+            textFromPDF = extractTextFromPDF(inputStream);
+        } catch (IOException e) {
+            throw new ParseException(e);
+        }
+        currentResult.setTitle(textFromPDF.first());
+        currentResult.setText(textFromPDF.last());
+
+        return currentResult;
+    }
+
+    /**
+     * Loads a PDF from the stream and extracts its title and text.
+     *
+     * @param is the raw PDF
+     * @return a pair of (title from the document information, extracted text)
+     * @throws IOException if the document can not be loaded or read
+     */
+    public static Pair<String, String> extractTextFromPDF(InputStream is) throws IOException {
+        PDDocument document = null;
+        try {
+            document = PDDocument.load(is);
+            PDFTextStripper pdfTextStripper = new PDFTextStripper("utf-8");
+            return new Pair<String, String>(document.getDocumentInformation()
+                    .getTitle(), pdfTextStripper.getText(document));
+        } finally {
+            // The document is closed whether or not extraction succeeded.
+            Execute.close(document);
+        }
+    }
+
+}

Reply all

Reply to author

Forward

0 new messages