![]() |
The Java Developers Almanac 1.4 | Order this book from Amazon. |
e1017. Getting the Links in an HTML Document// This method takes a URI which can be either a filename (e.g. file://c:/dir/file.html) // or a URL (e.g. http://host.com/page.html) and returns all HREF links in the document. public static String[] getLinks(String uriStr) { List result = new ArrayList(); try { // Create a reader on the HTML content URL url = new URI(uriStr).toURL(); URLConnection conn = url.openConnection(); Reader rd = new InputStreamReader(conn.getInputStream()); // Parse the HTML EditorKit kit = new HTMLEditorKit(); HTMLDocument doc = (HTMLDocument)kit.createDefaultDocument(); kit.read(rd, doc, 0); // Find all the A elements in the HTML document HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A); while (it.isValid()) { SimpleAttributeSet s = (SimpleAttributeSet)it.getAttributes(); String link = (String)s.getAttribute(HTML.Attribute.HREF); if (link != null) { // Add the link to the result list result.add(link); } it.next(); } } catch (MalformedURLException e) { } catch (URISyntaxException e) { } catch (BadLocationException e) { } catch (IOException e) { } // Return all found links return (String[])result.toArray(new String[result.size()]); }
© 2002 Addison-Wesley. |