Java HTML Parsing Example With htmlparser

Every week, I post the top 10 most-read javablogs entries on this blog. I do this because I don’t follow what’s happening over the weekend, so the list picks up interesting stories from the weekend, and because I don’t watch javablogs every day. Overall I find it a good way to stay up to date with the interesting stuff happening on javablogs.

As mentioned in an earlier post, my library of choice for the parsing is htmlparser (on SourceForge), because it’s free and open source, and because I am lazy and did not want to write my own. If you know a better open source library, feel free to add a comment about it; I’ll be glad to hear about it. htmlparser is not the easiest library to use: there are many entry points, and it’s not immediately clear which one to choose. So I am posting here how I used it, in case it saves a few minutes for people who have to do the same task.

  private static Entry parseEntry(String contentthrows ParserException
  {
    final Entry entry = new Entry();

    final NodeVisitor linkVisitor = new NodeVisitor() {
      
      @Override
      public void visitTag(Tag tag) {
        String name = tag.getTagName();

        if ("a".equalsIgnoreCase(name))
            {
              String hrefValue = tag.getAttribute("href");
              if (hrefValue != null && !hrefValue.startsWith("http://"))
              {
                if (!hrefValue.startsWith("/")) hrefValue = "/"+hrefValue;
                hrefValue = "http://javablogs.com"+hrefValue;
                //System.out.println("test, value="+hrefValue);
              }
              if (hrefValue != null)
              {
                hrefValue = hrefValue.replaceAll("&""&");
                tag.setAttribute("href", hrefValue);                
              }
            }
      }
    
    };
    
    NodeVisitor visitor = new NodeVisitor() {

      @Override
      public void visitTag(Tag tag) {        
        String name = tag.getTagName();
            if ("span".equalsIgnoreCase(name|| "div".equalsIgnoreCase(name))
            {              
              String classValue = tag.getAttribute("class");
//                LOGGER.debug("visittag name="+name+" class="+classValue+"children="+tag.getChildren().toHtml());
              if ("blogentrydetails".equals(classValue))
              {
                Pattern countPattern = Pattern.compile("Reads:\s([0-9])");
                Matcher matcher = countPattern.matcher(tag.getChildren().toHtml());
                if (matcher.find())
                {
                  String countStr = matcher.group(1);
                  entry.count = new Integer(countStr).intValue();
                }
                
              }
              else if ("blogentrysummary".equals(classValue))
              {
                try
                {
                  tag.getChildren().visitAllNodesWith(linkVisitor);
                }
                catch (ParserException pe)
                {
                  LOGGER.error(pe,pe);
                }
                entry.description = tag.getChildren().toHtml();                 
                entry.description = entry.description.replaceAll("\s+"" ");
              }
              else if ("blogentrytitle".equals(classValue))
              {
                try
                {
                  tag.getChildren().visitAllNodesWith(linkVisitor);
                }
                catch (ParserException pe)
                {
                  LOGGER.error(pe,pe);
                }
                entry.title =tag.getChildren().toHtml()
                entry.title = entry.title.replaceAll("\s+"" ");
              }              
            }
            
      }

    };
    Parser parser = new Parser(new Lexer(new Page(content,"UTF-8")));
    parser.visitAllNodesWith(visitor);
        if (entry.title != null)
        {
          return entry;
        }
        else return null;
  }

comments powered by Disqus
Tweet Submit to reddit
© 2006-16 Fabien Creative Commons License This work is licensed under a Creative Commons Attribution 4.0 International License.