VisClient/org/hfbk/util/HTMLUtils.java

Go to the documentation of this file.
00001 package org.hfbk.util;
00002 
00003 import java.util.regex.Matcher;
00004 import java.util.regex.Pattern;
00005 
00006 import org.apache.lucene.demo.html.Entities;
00007 
00013 public class HTMLUtils {
00014         
00015         
00025         public static String clean(String html){
00026                 String tmp="";          
00027                 for (String between: html.split("<[^<>]*>"))
00028                         tmp+=between;
00029 
00030                 tmp=decodeEntities(tmp);
00031                 tmp=tmp.replace('\n', ' ');
00032                 tmp=tmp.replace('\t', ' ');
00033                 tmp=tmp.replace('\r', ' ');
00034                 
00035                 return tmp;
00036         }
00037         
00038         
00039         static Matcher punctuationMatcher=Pattern.compile("(^[^\\p{N}\\p{L}])|([^\\p{N}\\p{L}]$)").matcher("");
00041         public static String removePunctuation(String text){
00042                 punctuationMatcher.reset(text);
00043                 return punctuationMatcher.replaceAll("");
00044         }
00045         
00046         static Matcher entityMatcher=Pattern.compile("&[^;]+;").matcher("");
00047         
00048         static String decodeEntities(String encoded){
00049                 StringBuffer decoded=new StringBuffer();
00050 
00051                 entityMatcher.reset(encoded);
00052                 while(entityMatcher.find()){
00053                         String match=entityMatcher.group();
00054                         entityMatcher.appendReplacement(decoded, Entities.decode(match));                       
00055                 }
00056                 entityMatcher.appendTail(decoded);
00057                 
00058                 return decoded.toString();              
00059         }
00060 }

Generated on Tue Apr 7 17:57:20 2009 for visclient by  doxygen 1.5.1