00001 package org.hfbk.util;
00002
00003 import java.util.regex.Matcher;
00004 import java.util.regex.Pattern;
00005
00006 import org.apache.lucene.demo.html.Entities;
00007
00013 public class HTMLUtils {
00014
00015
00025 public static String clean(String html){
00026 String tmp="";
00027 for (String between: html.split("<[^<>]*>"))
00028 tmp+=between;
00029
00030 tmp=decodeEntities(tmp);
00031 tmp=tmp.replace('\n', ' ');
00032 tmp=tmp.replace('\t', ' ');
00033 tmp=tmp.replace('\r', ' ');
00034
00035 return tmp;
00036 }
00037
00038
00039 static Matcher punctuationMatcher=Pattern.compile("(^[^\\p{N}\\p{L}])|([^\\p{N}\\p{L}]$)").matcher("");
00041 public static String removePunctuation(String text){
00042 punctuationMatcher.reset(text);
00043 return punctuationMatcher.replaceAll("");
00044 }
00045
00046 static Matcher entityMatcher=Pattern.compile("&[^;]+;").matcher("");
00047
00048 static String decodeEntities(String encoded){
00049 StringBuffer decoded=new StringBuffer();
00050
00051 entityMatcher.reset(encoded);
00052 while(entityMatcher.find()){
00053 String match=entityMatcher.group();
00054 entityMatcher.appendReplacement(decoded, Entities.decode(match));
00055 }
00056 entityMatcher.appendTail(decoded);
00057
00058 return decoded.toString();
00059 }
00060 }