Extracting Links from HTML – Java Example
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Attribute;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Prints the {@code href} value of every {@code <a>} start tag found in an HTML file,
 * using the HTML parser that ships with Swing.
 */
public class Main {

    /** File that is parsed when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT = "a.html";

    /**
     * Parses an HTML file and prints the list of link targets to standard output.
     *
     * @param args optional; {@code args[0]} is the path of the HTML file to read.
     *             When absent, {@code "a.html"} in the working directory is used.
     * @throws Exception if the file cannot be opened or read
     */
    public final static void main(String[] args) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;

        List<String> links;
        // try-with-resources guarantees the file handle is released even if parsing fails.
        // Note: FileReader(String) decodes the file with the platform default charset.
        try (Reader reader = new FileReader(path)) {
            links = extractLinks(reader);
        }
        System.out.println(links);
    }

    /**
     * Collects, in document order, the {@code href} attribute of each {@code <a>} start tag.
     * Anchors that carry no {@code href} (for example {@code <a name="top">}) are skipped.
     *
     * @param reader source of the HTML text; the caller remains responsible for closing it
     * @return the link targets found, possibly empty, never containing {@code null}
     * @throws IOException if reading from {@code reader} fails
     */
    private static List<String> extractLinks(Reader reader) throws IOException {
        final List<String> links = new ArrayList<String>();

        // ParserCallback already provides empty implementations of every handler,
        // so only the start-tag hook needs to be overridden.
        ParserCallback callback = new ParserCallback() {
            @Override
            public void handleStartTag(Tag tag, MutableAttributeSet attributes, int pos) {
                if (tag != Tag.A) {
                    return;
                }
                Object href = attributes.getAttribute(Attribute.HREF);
                if (href != null) {
                    links.add(href.toString());
                }
            }
        };

        // ignoreCharSet = true: the Reader has already decoded the bytes, so a charset
        // declared in a <meta> tag must not abort the parse with ChangedCharSetException.
        new ParserDelegator().parse(reader, callback, true);
        return links;
    }
}