info.bliki.wiki.dump
Class WikiXMLParser
java.lang.Object
org.xml.sax.helpers.DefaultHandler
info.bliki.wiki.dump.WikiXMLParser
- All Implemented Interfaces:
- org.xml.sax.ContentHandler, org.xml.sax.DTDHandler, org.xml.sax.EntityResolver, org.xml.sax.ErrorHandler
public class WikiXMLParser
- extends org.xml.sax.helpers.DefaultHandler
A Wikipedia XML dump file parser
Original version used with permission from Marco Schmidt. See: http://schmidt.devlib.org/software/lucene-wikipedia.html
- Author:
- Marco Schmidt
Method Summary |
void |
characters(char[] ch,
int start,
int length)
Parses an unlimited number of characters between two enclosing XML tags. |
void |
endDocument()
|
void |
endElement(java.lang.String uri,
java.lang.String name,
java.lang.String qName)
|
static java.io.BufferedReader |
getBufferedReader(java.lang.String wikiDumpFilename)
|
void |
parse()
|
void |
startDocument()
|
void |
startElement(java.lang.String namespaceURI,
java.lang.String localName,
java.lang.String qName,
org.xml.sax.Attributes atts)
|
Methods inherited from class org.xml.sax.helpers.DefaultHandler |
endPrefixMapping, error, fatalError, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startPrefixMapping, unparsedEntityDecl, warning |
Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
WikiXMLParser
public WikiXMLParser(java.lang.String filename,
IArticleFilter filter)
throws java.io.UnsupportedEncodingException,
java.io.IOException,
org.xml.sax.SAXException,
java.io.FileNotFoundException
- Throws:
java.io.UnsupportedEncodingException
java.io.IOException
org.xml.sax.SAXException
java.io.FileNotFoundException
WikiXMLParser
public WikiXMLParser(java.io.InputStream inputStream,
IArticleFilter filter)
throws org.xml.sax.SAXException
- Throws:
org.xml.sax.SAXException
WikiXMLParser
public WikiXMLParser(java.io.Reader reader,
IArticleFilter filter)
throws org.xml.sax.SAXException
- Throws:
org.xml.sax.SAXException
getBufferedReader
public static java.io.BufferedReader getBufferedReader(java.lang.String wikiDumpFilename)
throws java.io.UnsupportedEncodingException,
java.io.FileNotFoundException,
java.io.IOException
- Returns:
- a BufferedReader created from wikiDumpFilename
- Throws:
java.io.UnsupportedEncodingException
java.io.FileNotFoundException
java.io.IOException
startDocument
public void startDocument()
- Specified by:
startDocument
in interface org.xml.sax.ContentHandler
- Overrides:
startDocument
in class org.xml.sax.helpers.DefaultHandler
endDocument
public void endDocument()
- Specified by:
endDocument
in interface org.xml.sax.ContentHandler
- Overrides:
endDocument
in class org.xml.sax.helpers.DefaultHandler
startElement
public void startElement(java.lang.String namespaceURI,
java.lang.String localName,
java.lang.String qName,
org.xml.sax.Attributes atts)
- Specified by:
startElement
in interface org.xml.sax.ContentHandler
- Overrides:
startElement
in class org.xml.sax.helpers.DefaultHandler
endElement
public void endElement(java.lang.String uri,
java.lang.String name,
java.lang.String qName)
throws org.xml.sax.SAXException
- Specified by:
endElement
in interface org.xml.sax.ContentHandler
- Overrides:
endElement
in class org.xml.sax.helpers.DefaultHandler
- Throws:
org.xml.sax.SAXException
characters
public void characters(char[] ch,
int start,
int length)
throws org.xml.sax.SAXException
- Parses an unlimited number of characters between two enclosing XML tags.
- Specified by:
characters
in interface org.xml.sax.ContentHandler
- Overrides:
characters
in class org.xml.sax.helpers.DefaultHandler
- Throws:
org.xml.sax.SAXException
- See Also:
DefaultHandler.characters(char[], int, int)
parse
public void parse()
throws java.io.IOException,
org.xml.sax.SAXException
- Throws:
java.io.IOException
org.xml.sax.SAXException
Copyright © 2012 Java Wikipedia API (Bliki engine). All Rights Reserved.