import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawls ViewCVS directory listings of the {@code hartmath/hartmath2} module
 * and writes every file it finds to a matching relative path on local disk.
 *
 * <p>Directory listings are fetched from the base URL passed to
 * {@link #crawl(String, String)}; file contents are always fetched from the
 * hard-coded {@code *checkout*} URL in {@link #dlFile(String)}.
 */
public class SFCrawler {

    /** Read timeout, in milliseconds, applied to every HTTP connection. */
    private static final int READ_TIMEOUT_MILLIS = 60000;

    /** How many times a URL is tried before the last failure is rethrown. */
    private static final int MAX_ATTEMPTS = 5;

    /** Pause between attempts when a file download cannot be opened. */
    private static final long FILE_RETRY_DELAY_MILLIS = 5000L;

    /** Pause between attempts when a directory listing cannot be opened. */
    private static final long LISTING_RETRY_DELAY_MILLIS = 2000L;

    /**
     * Downloads one file, line by line, and writes it to {@code fileName}
     * relative to the current working directory.
     *
     * <p>The content is read with a line-based {@link Scanner} and rewritten with
     * {@link BufferedWriter#newLine()}, so line terminators are normalised to the
     * platform default and the platform default charset is used on both sides.
     *
     * @param fileName path of the file relative to the module root; used both as
     *                 the URL suffix and as the local output path
     * @throws IOException if the URL cannot be opened after {@value #MAX_ATTEMPTS}
     *                     attempts, the transfer fails part-way, or the local
     *                     file or its parent directory cannot be written
     * @throws InterruptedException if the thread is interrupted while waiting
     *                              between attempts
     */
    public void dlFile(String fileName) throws IOException, InterruptedException {
        String base = "http://cvs.sourceforge.net/viewcvs.py/*checkout*/hartmath/hartmath2/";
        URL url = new URL(base + fileName);
        Scanner s = openWithRetry(url, FILE_RETRY_DELAY_MILLIS);
        try {
            // crawl() only creates sub-directories it discovers, so the starting
            // directory may not exist yet when its own files are downloaded.
            File parent = new File(fileName).getParentFile();
            if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
                throw new IOException("Cannot create directory " + parent.getAbsolutePath());
            }
            BufferedWriter bw = new BufferedWriter(new FileWriter(fileName));
            try {
                System.out.println("Writing: " + fileName);
                while (s.hasNextLine()) {
                    bw.write(s.nextLine());
                    bw.newLine();
                }
                rethrowReadFailure(s);
                bw.flush();
            } finally {
                bw.close();
            }
        } finally {
            s.close();
        }
    }

    /**
     * Fetches the listing at {@code sUrl + pdir}, downloads every file entry it
     * recognises, then creates and recursively crawls every directory entry.
     *
     * <p>An entry is recognised only when a whole line of the listing matches one
     * of the two patterns built below. Files are downloaded while the listing is
     * still being read; sub-directories are visited after the listing has been
     * read completely.
     *
     * @param sUrl base URL of the listing pages
     * @param pdir directory path relative to the module root; appended to
     *             {@code sUrl} and used as the prefix of local paths
     * @throws IOException if a listing or file cannot be fetched after
     *                     {@value #MAX_ATTEMPTS} attempts, or a transfer or local
     *                     write fails
     * @throws InterruptedException if the thread is interrupted while waiting
     *                              between attempts
     */
    public void crawl(String sUrl, String pdir) throws IOException, InterruptedException {
        System.out.println("Crawling - " + sUrl + pdir);
        URL url = new URL(sUrl + pdir);

        // pdir is literal text, not a regex fragment: quote it so characters such
        // as '.' or '+' in a directory name cannot alter the pattern.
        String quotedDir = Pattern.quote(pdir);
        // NOTE(review): the leading "< a name=" (with a space) is kept exactly as
        // found; it may be an artefact of how this code was published - confirm
        // against the real listing markup.
        Pattern dirPattern = Pattern.compile("< a name=.*href=\"/viewcvs.py/hartmath/hartmath2/" + quotedDir + "(.*)/\">");
        // The unescaped '.' before "rev=" matches any single character
        // (presumably the '?' that starts the query string).
        Pattern filePattern = Pattern.compile("<td><a name=.* href=\"/viewcvs.py/hartmath/hartmath2/" + quotedDir + "(.*).rev=.*\">");

        List<String> dirs = new ArrayList<String>();
        Scanner s = openWithRetry(url, LISTING_RETRY_DELAY_MILLIS);
        try {
            while (s.hasNextLine()) {
                String line = s.nextLine();
                Matcher dirMatcher = dirPattern.matcher(line);
                Matcher fileMatcher = filePattern.matcher(line);
                if (fileMatcher.matches()) {
                    String file = fileMatcher.group(1);
                    System.out.println(file);
                    dlFile(pdir + file);
                } else if (dirMatcher.matches()) {
                    String dir = dirMatcher.group(1);
                    System.out.println(dir);
                    dirs.add(dir);
                }
            }
            rethrowReadFailure(s);
        } finally {
            s.close();
        }

        for (String dir : dirs) {
            File f = new File(pdir + dir);
            // mkdirs() returns false for a directory that already exists, which is
            // not a failure; only report on directories that had to be created.
            if (!f.isDirectory()) {
                if (f.mkdirs()) {
                    System.out.println("Successfully created: " + f.getAbsolutePath());
                } else {
                    System.err.println("Faliure in creating " + f.getAbsolutePath());
                }
            }
            crawl(sUrl, pdir + dir);
        }
    }

    /**
     * Opens {@code url} and wraps its response body in a {@link Scanner}, trying
     * up to {@value #MAX_ATTEMPTS} times.
     *
     * <p>A loop with a fixed bound is used instead of recursion so that a URL
     * that keeps failing neither retries forever nor grows the call stack.
     *
     * @param url              address to fetch
     * @param retryDelayMillis pause between consecutive attempts
     * @return a scanner over the response body; the caller must close it
     * @throws IOException the failure of the final attempt
     * @throws InterruptedException if interrupted while pausing between attempts
     */
    private Scanner openWithRetry(URL url, long retryDelayMillis)
            throws IOException, InterruptedException {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            HttpURLConnection conn = null;
            try {
                conn = (HttpURLConnection) url.openConnection();
                conn.setReadTimeout(READ_TIMEOUT_MILLIS);
                return new Scanner(conn.getInputStream());
            } catch (IOException e) {
                lastFailure = e;
                System.err.println(e.getMessage());
                if (conn != null) {
                    conn.disconnect();
                }
                if (attempt < MAX_ATTEMPTS) {
                    Thread.sleep(retryDelayMillis);
                }
            }
        }
        throw lastFailure;
    }

    /**
     * Rethrows the I/O error, if any, that made {@code s} stop reading.
     *
     * <p>{@link Scanner} treats a read error as end of input, so without this
     * check a broken transfer would look like a complete one.
     *
     * @param s scanner that has just reported no further lines
     * @throws IOException the error recorded by the scanner, if there is one
     */
    private static void rethrowReadFailure(Scanner s) throws IOException {
        IOException failure = s.ioException();
        if (failure != null) {
            throw failure;
        }
    }

    /**
     * Crawls the {@code src/} directory of the module and reports any failure on
     * the console.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        SFCrawler c = new SFCrawler();
        try {
            c.crawl("http://cvs.sourceforge.net/viewcvs.py/hartmath/hartmath2/", "src/");
        } catch (IOException e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag for whoever started this thread.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}
// 1 comment:
// Nice bit of code cheers :-)
// Flat House Share
// Post a Comment