00001 package org.hfbk.vis.source;
00002
00003 import java.io.IOException;
00004 import java.net.URL;
00005 import java.util.regex.Matcher;
00006 import java.util.regex.Pattern;
00007
00008 import org.dronus.graph.Node;
00009 import org.hfbk.util.HTMLUtils;
00010 import org.hfbk.util.HTTPUtils;
00011
00012 public class SourceWikipedia extends SourceRegExp {
00013
00014 public SourceWikipedia(URL url) {
00015 super(url);
00016 root=add(keyword,"keycloud",null);
00017 }
00018
00019 String url(){
00020 return "http://www.google.de/search?btnI=yes&q=inurl:wikipedia.org+"+HTTPUtils.encode(keyword);
00021 }
00022
00023 String matcher(){
00024 return "(?si)(?:href=\"/wiki/([^:\"]*?)\")|(?:src=\"(http://upload.[^\"]*?)\")";
00025 }
00026
00027 void buildGraph() throws IOException {
00028
00029 String page=HTTPUtils.fetch(url(), silent);
00030
00031 page=page.substring(page.indexOf("<!-- start content -->"), page.lastIndexOf("<!-- end content -->"));
00032
00033 Pattern p=Pattern.compile(matcher());
00034 Matcher matcher=p.matcher(page);
00035 while (matcher.find()){
00036 String imgMatch=matcher.group(2);
00037 if(imgMatch!=null) {
00038 String imgUrl=imgMatch.replace("thumb/","");
00039 imgUrl=imgUrl.substring(0,imgUrl.lastIndexOf('/'));
00040 Node img=add(imgUrl,"image",root);
00041 add(imgMatch, "thumbnail", img);
00042 }
00043 else add(matcher.group(1),"keyword",root);
00044 }
00045 }
00046
00047
00048 void parse(String[] matches) throws IOException {}
00049 }