Homework 2 -- Crawl Class

2 views
Skip to first unread message

doug chang

unread,
Nov 16, 2010, 5:25:00 PM11/16/10
to hackerdojo-javaconcurrency
Hi Marshall: I have performance issues. Will ask you at dinner. :)

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.*;
import org.w3c.dom.*;


//doug chang doug....@hackerdojo.com
public class Crawl {
// private static ConcurrentHashMap<String,String> visitedURLS = new
ConcurrentHashMap<String,String>();

public static void main(String []args) throws ExecutionException,
InterruptedException{
new Crawl();
BlockingQueue<String> links = new LinkedBlockingQueue<String>();
BlockingQueue<Node> downloadedWebPages = new
LinkedBlockingQueue<Node> ();
ConcurrentHashMap<String,String> visitedURLS = new
ConcurrentHashMap<String,String>();

ExecutorService threadPool = Executors.newCachedThreadPool();
links.add("http://www.stanford.edu");
Future<?> f1=null;
Future<?> f2=null;
Future<?> f3=null;
Future<?> f4=null;

while(!threadPool.isShutdown()){
if(!links.isEmpty()){
System.out.println("CRAWL visitedURLS size:"+visitedURLS.size());
f1 = threadPool.submit(new
Crawler(links,downloadedWebPages,visitedURLS));
f1.get();//put his into an exception block?
if(links.size()>100){
f3 = threadPool.submit(new
Crawler(links,downloadedWebPages,visitedURLS));
f3.get();//put his into an exception block?
f4 = threadPool.submit(new
Crawler(links,downloadedWebPages,visitedURLS));
f4.get();//put his into an exception block?
}
}
if(!downloadedWebPages.isEmpty()){
f2 = threadPool.submit(new
Indexer(links,downloadedWebPages,visitedURLS));
f2.get();
}
}
}
}


import java.util.concurrent.*;

import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Node;


//doug chang doug....@hackerdojo.com
/**
 * Downloads one page per {@link #run()}: takes a URL from the link queue, parses it into a
 * DOM tree and hands the tree to the page queue.
 *
 * <p>All state is held in final references to concurrent collections supplied by the caller.
 */
@net.jcip.annotations.ThreadSafe
public class Crawler implements Runnable {
    private final BlockingQueue<String> urlLinks;
    private final BlockingQueue<Node> urlContent;
    private final ConcurrentHashMap<String, String> visitedLinks;

    /**
     * @param links queue of URLs waiting to be downloaded
     * @param pages queue that receives the parsed documents
     * @param visited map whose keys are the URLs already claimed by a crawler
     */
    public Crawler(BlockingQueue<String> links, BlockingQueue<Node> pages,
            ConcurrentHashMap<String, String> visited) {
        this.urlContent = pages;
        this.urlLinks = links;
        this.visitedLinks = visited;
    }

    /**
     * Takes the next URL, blocking until one is available, and downloads it unless it has
     * already been claimed.
     *
     * @throws InterruptedException if interrupted while waiting on either queue
     * @throws Exception if the page cannot be fetched or parsed
     */
    public void crawl() throws Exception {
        String getPage = urlLinks.take();

        // Claim the URL atomically BEFORE downloading. The same link can sit in the queue
        // many times; only the crawler whose putIfAbsent wins performs the fetch.
        if (visitedLinks.putIfAbsent(getPage, "") != null) {
            return;
        }

        System.out.println("CRAWLER getPage:" + getPage);
        DOMParser parser = new DOMParser();
        parser.parse(getPage);
        urlContent.put(parser.getDocument());
    }

    /** Runs a single {@link #crawl()} and reports, rather than hides, any failure. */
    @Override
    public void run() {
        try {
            crawl();
        } catch (InterruptedException e) {
            // Restore the flag so the owner of this thread can observe the interruption.
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            // A page that cannot be fetched or parsed is skipped; it is not an interruption.
            System.err.println("CRAWLER failed to download a page: " + e);
        }
    }

}
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;

import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

// doug chang doug....@hackerdojo.com
/**
 * Extracts hyperlinks from one downloaded page and queues the unvisited ones for crawling.
 *
 * <p>Each {@link #run()} takes a single DOM tree from the page queue and walks it. For
 * every {@code A} element, the {@code href} target is resolved against the node's base URI
 * and, if it is an http(s) URL not yet present in the visited map, put on the URL queue.
 */
public class Indexer implements Runnable {
    private final BlockingQueue<String> urlLinks;
    private final BlockingQueue<Node> urlContent;
    private final ConcurrentHashMap<String, String> links;

    /**
     * @param urls queue that receives newly discovered URLs
     * @param pages queue of downloaded DOM trees waiting to be indexed
     * @param visitedLinks map whose keys are the URLs that were already crawled
     */
    public Indexer(BlockingQueue<String> urls, BlockingQueue<Node> pages,
            ConcurrentHashMap<String, String> visitedLinks) {
        this.urlLinks = urls;
        this.urlContent = pages;
        this.links = visitedLinks;
    }

    /**
     * Walks {@code node} and all of its descendants and queues every unvisited link found.
     * If the thread is interrupted while queueing, the walk stops and the interrupt flag
     * is restored.
     *
     * @param node root of the subtree to scan; {@code null} is ignored
     */
    public void index(Node node) {
        if (node == null) {
            return;
        }
        try {
            indexSubtree(node);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /** Depth-first walk that queues the link of each anchor element it meets. */
    private void indexSubtree(Node node) throws InterruptedException {
        if ("A".equalsIgnoreCase(node.getNodeName().trim())) {
            queueAnchorTarget(node);
        }
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            indexSubtree(child);
        }
    }

    /** Reads the anchor's href attribute, resolves it and queues it if not yet visited. */
    private void queueAnchorTarget(Node anchor) throws InterruptedException {
        NamedNodeMap attributes = anchor.getAttributes();
        if (attributes == null) {
            return;
        }
        for (int i = 0; i < attributes.getLength(); i++) {
            Node attribute = attributes.item(i);
            // Match the attribute by name and read its value directly; parsing
            // Attr.toString() depends on the DOM implementation and also hits "hreflang".
            if (!"href".equalsIgnoreCase(attribute.getNodeName())
                    || attribute.getNodeValue() == null) {
                continue;
            }
            String hrefLink = resolveHttpLink(anchor.getBaseURI(), attribute.getNodeValue().trim());
            if (hrefLink == null) {
                continue;
            }
            System.out.println("indexer href link:" + hrefLink);

            // BlockingQueue.put is itself thread-safe, so no extra lock is needed to add a
            // URL. The containsKey-then-put pair is NOT atomic, though: a link may still be
            // queued more than once before any crawler records it as visited.
            if (!links.containsKey(hrefLink)) {
                System.out.println("INDEXER adding href to queue:" + hrefLink);
                System.out.println("INDEXER num links visited:" + links.size());
                System.out.println("INDEXER urlLinks:" + urlLinks.size());
                System.out.println("INDEXER urlContent:" + urlContent.size());

                urlLinks.put(hrefLink);
            }
        }
    }

    /**
     * Turns an href value into an absolute http(s) URL.
     *
     * @return the absolute URL, or {@code null} when the href is empty, malformed, relative
     *         without a usable base, or uses a scheme other than http/https
     */
    private static String resolveHttpLink(String baseUri, String href) {
        if (href.length() == 0) {
            return null;
        }
        try {
            java.net.URI target = java.net.URI.create(href);
            if (!target.isAbsolute()) {
                if (baseUri == null) {
                    return null;
                }
                java.net.URI base = java.net.URI.create(baseUri);
                // A base such as "http://host" has an empty path; give it "/" first so a
                // relative "page.html" becomes "http://host/page.html", not "http://hostpage.html".
                if (base.getHost() != null && base.getPath() != null
                        && base.getPath().length() == 0) {
                    base = base.resolve("/");
                }
                target = base.resolve(target);
            }
            String scheme = target.getScheme();
            if ("http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme)) {
                return target.toString();
            }
            return null;
        } catch (IllegalArgumentException e) {
            // URI.create rejects malformed input; such links are skipped.
            return null;
        }
    }

    /** Takes one page from the page queue, blocking until one is available, and indexes it. */
    @Override
    public void run() {
        try {
            Node node = urlContent.take();
            index(node);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

}

Reply all
Reply to author
Forward
0 new messages