package org.hfbk.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.demo.html.Entities;

/**
 * Static helpers for turning HTML fragments into plain text.
 *
 * <p>The methods of this class keep no shared mutable state: every call
 * creates its own {@link Matcher} from a precompiled {@link Pattern}.
 */
public class HTMLUtils {

    /** Matches one markup tag: {@code <}, any run of non-bracket characters, {@code >}. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^<>]*>");

    /** Matches a non-letter, non-digit character at the start or at the end of the input. */
    private static final Pattern PUNCTUATION_PATTERN =
            Pattern.compile("(^[^\\p{N}\\p{L}])|([^\\p{N}\\p{L}]$)");

    /** Matches an entity-like token: {@code &}, one or more non-{@code ;} characters, {@code ;}. */
    private static final Pattern ENTITY_PATTERN = Pattern.compile("&[^;]+;");

    /**
     * No longer used by this class.
     *
     * @deprecated a shared {@link Matcher} is not safe for concurrent use; the
     *             field is only retained in case other classes in this package
     *             still reference it.
     */
    @Deprecated
    static Matcher punctuationMatcher = PUNCTUATION_PATTERN.matcher("");

    /**
     * No longer used by this class.
     *
     * @deprecated a shared {@link Matcher} is not safe for concurrent use; the
     *             field is only retained in case other classes in this package
     *             still reference it.
     */
    @Deprecated
    static Matcher entityMatcher = ENTITY_PATTERN.matcher("");

    /**
     * Reduces an HTML fragment to plain text.
     *
     * <p>Processing order: every {@code <...>} tag is removed, the remaining
     * text is passed through {@link #decodeEntities(String)}, and finally each
     * newline, tab and carriage return is replaced by a single space.
     *
     * @param html the markup to clean; must not be {@code null}
     * @return the text without tags, with entities decoded and line breaks and
     *         tabs turned into spaces
     * @throws NullPointerException if {@code html} is {@code null}
     */
    public static String clean(String html) {
        // Deleting every tag match yields the same text as splitting on the
        // tags and concatenating the pieces, without the quadratic string
        // concatenation.
        String text = TAG_PATTERN.matcher(html).replaceAll("");

        text = decodeEntities(text);

        return text.replace('\n', ' ')
                   .replace('\t', ' ')
                   .replace('\r', ' ');
    }

    /**
     * Strips a leading and a trailing character that is neither a Unicode
     * letter nor a Unicode digit. Characters inside the text are left alone.
     *
     * @param text the text to trim; must not be {@code null}
     * @return {@code text} without the matched edge characters
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static String removePunctuation(String text) {
        return PUNCTUATION_PATTERN.matcher(text).replaceAll("");
    }

    /**
     * Replaces every {@code &...;} token in the input with the result of
     * {@link Entities#decode(String)} for that token.
     *
     * <p>NOTE(review): what {@code Entities.decode} returns for an unknown or
     * malformed token is not visible here - confirm before relying on it.
     *
     * @param encoded text that may contain entity tokens; must not be {@code null}
     * @return the text with each token replaced by its decoded form
     * @throws NullPointerException if {@code encoded} is {@code null}
     */
    static String decodeEntities(String encoded) {
        // StringBuffer rather than StringBuilder: the StringBuilder overload of
        // appendReplacement only exists since Java 9.
        StringBuffer decoded = new StringBuffer(encoded.length());

        Matcher matcher = ENTITY_PATTERN.matcher(encoded);
        while (matcher.find()) {
            String replacement = Entities.decode(matcher.group());
            // appendReplacement interprets '$' and '\' in its argument as group
            // references / escapes; quote the decoded text so that it is
            // inserted literally.
            matcher.appendReplacement(decoded, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(decoded);

        return decoded.toString();
    }
}