info.bliki.wiki.dump
Class WikiXMLParser

java.lang.Object
  extended by org.xml.sax.helpers.DefaultHandler
      extended by info.bliki.wiki.dump.WikiXMLParser
All Implemented Interfaces:
org.xml.sax.ContentHandler, org.xml.sax.DTDHandler, org.xml.sax.EntityResolver, org.xml.sax.ErrorHandler

public class WikiXMLParser
extends org.xml.sax.helpers.DefaultHandler

A Wikipedia XML dump file parser. Original version used with permission from Marco Schmidt. See: http://schmidt.devlib.org/software/lucene-wikipedia.html

Author:
Marco Schmidt

Constructor Summary
WikiXMLParser(java.io.InputStream inputStream, IArticleFilter filter)
           
WikiXMLParser(java.io.Reader reader, IArticleFilter filter)
           
WikiXMLParser(java.lang.String filename, IArticleFilter filter)
           
 
Method Summary
 void characters(char[] ch, int start, int length)
          Parses an unlimited number of characters between two enclosing XML tags.
 void endDocument()
           
 void endElement(java.lang.String uri, java.lang.String name, java.lang.String qName)
           
static java.io.BufferedReader getBufferedReader(java.lang.String wikiDumpFilename)
           
 void parse()
           
 void startDocument()
           
 void startElement(java.lang.String namespaceURI, java.lang.String localName, java.lang.String qName, org.xml.sax.Attributes atts)
           
 
Methods inherited from class org.xml.sax.helpers.DefaultHandler
endPrefixMapping, error, fatalError, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startPrefixMapping, unparsedEntityDecl, warning
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

WikiXMLParser

public WikiXMLParser(java.lang.String filename,
                     IArticleFilter filter)
              throws java.io.UnsupportedEncodingException,
                     java.io.IOException,
                     org.xml.sax.SAXException,
                     java.io.FileNotFoundException
Throws:
java.io.UnsupportedEncodingException
java.io.IOException
org.xml.sax.SAXException
java.io.FileNotFoundException

WikiXMLParser

public WikiXMLParser(java.io.InputStream inputStream,
                     IArticleFilter filter)
              throws org.xml.sax.SAXException
Throws:
org.xml.sax.SAXException

WikiXMLParser

public WikiXMLParser(java.io.Reader reader,
                     IArticleFilter filter)
              throws org.xml.sax.SAXException
Throws:
org.xml.sax.SAXException
Method Detail

getBufferedReader

public static java.io.BufferedReader getBufferedReader(java.lang.String wikiDumpFilename)
                                                throws java.io.UnsupportedEncodingException,
                                                       java.io.FileNotFoundException,
                                                       java.io.IOException
Returns:
a BufferedReader created from wikiDumpFilename
Throws:
java.io.UnsupportedEncodingException
java.io.FileNotFoundException
java.io.IOException

startDocument

public void startDocument()
Specified by:
startDocument in interface org.xml.sax.ContentHandler
Overrides:
startDocument in class org.xml.sax.helpers.DefaultHandler

endDocument

public void endDocument()
Specified by:
endDocument in interface org.xml.sax.ContentHandler
Overrides:
endDocument in class org.xml.sax.helpers.DefaultHandler

startElement

public void startElement(java.lang.String namespaceURI,
                         java.lang.String localName,
                         java.lang.String qName,
                         org.xml.sax.Attributes atts)
Specified by:
startElement in interface org.xml.sax.ContentHandler
Overrides:
startElement in class org.xml.sax.helpers.DefaultHandler

endElement

public void endElement(java.lang.String uri,
                       java.lang.String name,
                       java.lang.String qName)
                throws org.xml.sax.SAXException
Specified by:
endElement in interface org.xml.sax.ContentHandler
Overrides:
endElement in class org.xml.sax.helpers.DefaultHandler
Throws:
org.xml.sax.SAXException

characters

public void characters(char[] ch,
                       int start,
                       int length)
                throws org.xml.sax.SAXException
Parses an unlimited number of characters between two enclosing XML tags.

Specified by:
characters in interface org.xml.sax.ContentHandler
Overrides:
characters in class org.xml.sax.helpers.DefaultHandler
Throws:
org.xml.sax.SAXException
See Also:
DefaultHandler.characters(char[], int, int)

parse

public void parse()
           throws java.io.IOException,
                  org.xml.sax.SAXException
Throws:
java.io.IOException
org.xml.sax.SAXException


Copyright © 2012 Java Wikipedia API (Bliki engine). All Rights Reserved.