Friday, October 07, 2005

Another piece of code to walk something - guess what...

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Walks a ViewCVS directory listing over HTTP and mirrors it into the current
 * working directory: every file link found on a listing page is downloaded via
 * the {@code *checkout*} URL, and every sub-directory link is created locally
 * and crawled recursively.
 *
 * <p>Content is transferred line by line as text using the platform default
 * charset, so line terminators are normalised to the platform separator.
 */
public class SFCrawler {

    /** Base URL from which raw file contents are fetched. */
    private static final String CHECKOUT_BASE =
            "http://cvs.sourceforge.net/viewcvs.py/*checkout*/hartmath/hartmath2/";

    /** Read timeout applied to every HTTP connection, in milliseconds. */
    private static final int READ_TIMEOUT_MILLIS = 60000;

    /** How many times a URL is tried before the last I/O error is propagated. */
    private static final int MAX_ATTEMPTS = 10;

    /** Pause between attempts when downloading a file, in milliseconds. */
    private static final long FILE_RETRY_DELAY_MILLIS = 5000;

    /** Pause between attempts when fetching a directory listing, in milliseconds. */
    private static final long LISTING_RETRY_DELAY_MILLIS = 2000;

    /**
     * Downloads one file and stores it under the same relative path locally.
     *
     * @param fileName path of the file relative to the repository root; also
     *                 used as the local (relative) target path
     * @throws IOException if the file cannot be fetched after
     *                     {@value #MAX_ATTEMPTS} attempts, the transfer breaks
     *                     midway, or the local file cannot be written
     * @throws InterruptedException if the thread is interrupted while waiting
     *                              between attempts
     */
    public void dlFile(String fileName) throws IOException, InterruptedException {
        URL url = new URL(CHECKOUT_BASE + fileName);
        Scanner in = openWithRetry(url, FILE_RETRY_DELAY_MILLIS);
        try {
            File target = new File(fileName);
            // The root directory passed to crawl() is never created by crawl()
            // itself, so make sure the parent exists before opening the writer.
            File parent = target.getParentFile();
            if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
                throw new IOException("Could not create directory " + parent.getAbsolutePath());
            }

            BufferedWriter out = new BufferedWriter(new FileWriter(target));
            try {
                System.out.println("Writing: " + fileName);
                while (in.hasNextLine()) {
                    out.write(in.nextLine());
                    out.newLine();
                }
            } finally {
                out.close(); // close() flushes
            }
            failOnReadError(in);
        } finally {
            in.close(); // also closes the underlying HTTP stream
        }
    }

    /**
     * Fetches the listing page at {@code sUrl + pdir}, downloads every file it
     * links to and then recurses into every linked sub-directory.
     *
     * @param sUrl base URL of the ViewCVS listing
     * @param pdir directory path relative to {@code sUrl}; it is also the
     *             local relative directory the contents are written to
     * @throws IOException if a page or file cannot be fetched after
     *                     {@value #MAX_ATTEMPTS} attempts or a local write fails
     * @throws InterruptedException if the thread is interrupted while waiting
     *                              between attempts
     */
    public void crawl(String sUrl, String pdir) throws IOException, InterruptedException {
        System.out.println("Crawling - " + sUrl + pdir);
        URL url = new URL(sUrl + pdir);

        // pdir is data, not a regex: quote it so characters such as '.' or '+'
        // in a directory name are matched literally.
        String quotedDir = Pattern.quote(pdir);
        // NOTE(review): the leading "< a" (with a space) looks like a paste
        // artifact and may need to be "<a" to match real listing pages - confirm
        // against the actual HTML before changing it.
        Pattern dirPattern = Pattern.compile("< a name=.*href=\"/viewcvs.py/hartmath/hartmath2/" + quotedDir + "(.*)/\">");
        Pattern filePattern = Pattern.compile("<td><a name=.* href=\"/viewcvs.py/hartmath/hartmath2/" + quotedDir + "(.*).rev=.*\">");

        ArrayList<String> files = new ArrayList<String>();
        ArrayList<String> dirs = new ArrayList<String>();

        // Read the whole listing first and close it before downloading, so the
        // listing connection is not held open for the duration of the downloads.
        Scanner listing = openWithRetry(url, LISTING_RETRY_DELAY_MILLIS);
        try {
            while (listing.hasNextLine()) {
                String line = listing.nextLine();

                Matcher fileMatcher = filePattern.matcher(line);
                if (fileMatcher.matches()) {
                    String file = fileMatcher.group(1);
                    System.out.println(file);
                    files.add(file);
                    continue;
                }

                Matcher dirMatcher = dirPattern.matcher(line);
                if (dirMatcher.matches()) {
                    String dir = dirMatcher.group(1);
                    System.out.println(dir);
                    dirs.add(dir);
                }
            }
            failOnReadError(listing);
        } finally {
            listing.close();
        }

        for (String file : files) {
            dlFile(pdir + file);
        }

        for (String dir : dirs) {
            File f = new File(pdir + dir);
            // mkdirs() returns false when the directory already exists, which
            // is not an error for a re-run of the crawler.
            if (f.isDirectory()) {
                System.out.println("Already exists: " + f.getAbsolutePath());
            } else if (f.mkdirs()) {
                System.out.println("Successfully created: " + f.getAbsolutePath());
            } else {
                System.err.println("Failure in creating " + f.getAbsolutePath());
            }
            crawl(sUrl, pdir + dir);
        }
    }

    /**
     * Opens {@code url} for reading, retrying on I/O errors with a fixed pause
     * between attempts instead of recursing without bound.
     *
     * @return a scanner over the response body; the caller must close it
     * @throws IOException the error of the last attempt, once
     *                     {@value #MAX_ATTEMPTS} attempts have failed
     * @throws InterruptedException if interrupted while pausing between attempts
     */
    private static Scanner openWithRetry(URL url, long retryDelayMillis)
            throws IOException, InterruptedException {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            HttpURLConnection conn = null;
            try {
                conn = (HttpURLConnection) url.openConnection();
                conn.setReadTimeout(READ_TIMEOUT_MILLIS);
                return new Scanner(conn.getInputStream());
            } catch (IOException e) {
                lastFailure = e;
                System.err.println("Attempt " + attempt + " of " + MAX_ATTEMPTS
                        + " failed for " + url + ": " + e.getMessage());
                if (conn != null) {
                    conn.disconnect();
                }
                if (attempt < MAX_ATTEMPTS) {
                    Thread.sleep(retryDelayMillis);
                }
            }
        }
        throw lastFailure;
    }

    /**
     * Rethrows an I/O error that the scanner hit while reading. Scanner does
     * not throw such errors itself; it just stops returning input, which would
     * otherwise leave a truncated result unnoticed.
     */
    private static void failOnReadError(Scanner scanner) throws IOException {
        IOException readFailure = scanner.ioException();
        if (readFailure != null) {
            throw readFailure;
        }
    }

    /**
     * Mirrors the {@code src/} directory of the hartmath2 module into the
     * current working directory.
     */
    public static void main(String[] args) {
        SFCrawler c = new SFCrawler();

        try {
            c.crawl("http://cvs.sourceforge.net/viewcvs.py/hartmath/hartmath2/", "src/");
        } catch (IOException e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the JVM/caller can observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}