package org.maltparser.core.symbol.trie;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.UnsupportedEncodingException;

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.helper.HashMap;
import org.maltparser.core.symbol.SymbolException;
import org.maltparser.core.symbol.SymbolTable;
import org.maltparser.core.symbol.SymbolTableHandler;


/**
 * A {@link SymbolTableHandler} that keeps a collection of named
 * {@link TrieSymbolTable}s, all of which are constructed with the same
 * {@link Trie} instance owned by this handler.
 *
 * @author Johan Hall
 */
public class TrieSymbolTableHandler implements SymbolTableHandler {
    /** Separator used between the fields of a header line; compiled once and reused. */
    private static final Pattern TAB_PATTERN = Pattern.compile("\t");

    private final Trie trie;
    private final HashMap<String, TrieSymbolTable> symbolTables;

    /**
     * Creates a handler with a fresh trie and no symbol tables.
     */
    public TrieSymbolTableHandler() {
        trie = new Trie();
        symbolTables = new HashMap<String, TrieSymbolTable>();
    }

    /**
     * Returns the symbol table registered under {@code tableName}, creating and
     * registering a new one if none exists yet.
     *
     * @param tableName the name of the symbol table
     * @return the existing or newly created symbol table
     * @throws MaltChainedException declared for the table construction
     */
    public TrieSymbolTable addSymbolTable(String tableName) throws MaltChainedException {
        TrieSymbolTable symbolTable = symbolTables.get(tableName);
        if (symbolTable == null) {
            symbolTable = new TrieSymbolTable(tableName, trie);
            symbolTables.put(tableName, symbolTable);
        }
        return symbolTable;
    }

    /**
     * Returns the symbol table registered under {@code tableName}, creating and
     * registering a new one if none exists yet. A new table takes its column
     * category and null value strategy from {@code parentTable}.
     *
     * @param tableName the name of the symbol table
     * @param parentTable the table to copy the settings from; it is cast to
     *        {@link TrieSymbolTable} and is only consulted when a new table is created
     * @return the existing or newly created symbol table
     * @throws MaltChainedException declared for the table construction
     */
    public TrieSymbolTable addSymbolTable(String tableName, SymbolTable parentTable) throws MaltChainedException {
        TrieSymbolTable symbolTable = symbolTables.get(tableName);
        if (symbolTable == null) {
            final TrieSymbolTable trieParentTable = (TrieSymbolTable) parentTable;
            symbolTable = new TrieSymbolTable(tableName, trie, trieParentTable.getColumnCategory(), trieParentTable.getNullValueStrategy());
            symbolTables.put(tableName, symbolTable);
        }
        return symbolTable;
    }

    /**
     * Returns the symbol table registered under {@code tableName}, creating and
     * registering a new one with the given settings if none exists yet. The
     * settings are ignored when the table already exists.
     *
     * @param tableName the name of the symbol table
     * @param columnCategory the column category passed to a newly created table
     * @param nullValueStrategy the null value strategy passed to a newly created table
     * @return the existing or newly created symbol table
     * @throws MaltChainedException declared for the table construction
     */
    public TrieSymbolTable addSymbolTable(String tableName, int columnCategory, String nullValueStrategy) throws MaltChainedException {
        TrieSymbolTable symbolTable = symbolTables.get(tableName);
        if (symbolTable == null) {
            symbolTable = new TrieSymbolTable(tableName, trie, columnCategory, nullValueStrategy);
            symbolTables.put(tableName, symbolTable);
        }
        return symbolTable;
    }

    /**
     * Looks up a symbol table by name.
     *
     * @param tableName the name of the symbol table
     * @return whatever the underlying map holds for {@code tableName}
     */
    public TrieSymbolTable getSymbolTable(String tableName) {
        return symbolTables.get(tableName);
    }

    /**
     * Returns the names of all registered symbol tables.
     *
     * @return the key set of the underlying map (not a copy)
     */
    public Set<String> getSymbolTableNames() {
        return symbolTables.keySet();
    }

    /**
     * Does nothing in this implementation.
     */
    public void cleanUp() {
    }

    /**
     * Writes the header of every symbol table, then a newline, then the content
     * of every symbol table. The writer is always closed before this method
     * returns, whether or not saving succeeded.
     *
     * @param osw the writer to save to
     * @throws MaltChainedException if writing fails or a table cannot be saved
     */
    public void save(OutputStreamWriter osw) throws MaltChainedException {
        final BufferedWriter bout = new BufferedWriter(osw);
        try {
            for (TrieSymbolTable table : symbolTables.values()) {
                table.saveHeader(bout);
            }
            bout.write('\n');
            for (TrieSymbolTable table : symbolTables.values()) {
                table.save(bout);
            }
            // Close on the success path so that a failing final flush is reported.
            bout.close();
        } catch (IOException e) {
            throw new SymbolException("Could not save the symbol tables. ", e);
        } finally {
            // Releases the writer on the failure path; has no effect after a successful close.
            closeQuietly(bout);
        }
    }

    /**
     * Saves all symbol tables to the file {@code fileName} using the character
     * set {@code charSet}.
     *
     * @param fileName the file to write
     * @param charSet the name of the character encoding
     * @throws MaltChainedException if the file cannot be created, the character
     *         set is not supported, or saving fails
     */
    public void save(String fileName, String charSet) throws MaltChainedException {
        final FileOutputStream out;
        try {
            out = new FileOutputStream(fileName);
        } catch (FileNotFoundException e) {
            throw new SymbolException("The symbol table file '"+fileName+"' cannot be created. ", e);
        }
        final OutputStreamWriter osw;
        try {
            osw = new OutputStreamWriter(out, charSet);
        } catch (UnsupportedEncodingException e) {
            // No writer took ownership of the stream, so it has to be released here.
            closeQuietly(out);
            throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
        }
        save(osw);
    }

    /**
     * Reads header lines from {@code bin} and registers one symbol table per
     * line. Reading stops at the first line that is empty or does not start with
     * a tab character (that line is consumed), or at the end of the input.
     * After the leading tab, each header line must consist of three tab-separated
     * fields: table name, column category (an integer) and null value strategy.
     *
     * @param bin the reader positioned at the first header line
     * @throws MaltChainedException if a header line is malformed or reading fails
     */
    public void loadHeader(BufferedReader bin) throws MaltChainedException {
        String fileLine;
        try {
            while ((fileLine = bin.readLine()) != null) {
                if (fileLine.length() == 0 || fileLine.charAt(0) != '\t') {
                    break;
                }
                final String header = fileLine.substring(1);
                final String[] items;
                try {
                    items = TAB_PATTERN.split(header);
                } catch (PatternSyntaxException e) {
                    throw new SymbolException("The header line of the symbol table '"+header+"' could not split into atomic parts. ", e);
                }
                if (items.length != 3) {
                    throw new SymbolException("The header line of the symbol table '"+header+"' must contain four columns. ");
                }
                addSymbolTable(items[0], Integer.parseInt(items[1]), items[2]);
            }
        } catch (NumberFormatException e) {
            throw new SymbolException("The symbol table file (.sym) contains a non-integer value in the header. ", e);
        } catch (IOException e) {
            throw new SymbolException("Could not load the symbol table. ", e);
        }
    }

    /**
     * Loads symbol tables from {@code isr}. If the input starts with a tab
     * character, the header section is read first via
     * {@link #loadHeader(BufferedReader)}. Afterwards every non-empty line is
     * taken as a table name and the corresponding table loads itself from the
     * reader. The reader is always closed before this method returns, whether or
     * not loading succeeded.
     *
     * @param isr the reader to load from
     * @throws MaltChainedException if reading fails or a table cannot be loaded
     */
    public void load(InputStreamReader isr) throws MaltChainedException {
        final BufferedReader bin = new BufferedReader(isr);
        try {
            // Peek at the first character to find out whether a header section is present.
            bin.mark(2);
            final int firstChar = bin.read();
            bin.reset();
            if (firstChar == '\t') {
                loadHeader(bin);
            }
            String fileLine;
            while ((fileLine = bin.readLine()) != null) {
                if (fileLine.length() > 0) {
                    final SymbolTable table = addSymbolTable(fileLine);
                    table.load(bin);
                }
            }
            bin.close();
        } catch (IOException e) {
            throw new SymbolException("Could not load the symbol tables. ", e);
        } finally {
            // Releases the reader on the failure path; has no effect after a successful close.
            closeQuietly(bin);
        }
    }

    /**
     * Loads symbol tables from the file {@code fileName} using the character set
     * {@code charSet}.
     *
     * @param fileName the file to read
     * @param charSet the name of the character encoding
     * @throws MaltChainedException if the file cannot be found, the character set
     *         is not supported, or loading fails
     */
    public void load(String fileName, String charSet) throws MaltChainedException {
        load(openReader(fileName, charSet, "symbol table file"));
    }

    /**
     * Reads a tagset file and adds every line, trimmed of surrounding
     * whitespace, as a symbol to the table named {@code tableName}. The table is
     * created with the given settings if it does not exist yet. The file is
     * opened before the table is looked up or created, and it is always closed
     * before this method returns.
     *
     * @param fileName the tagset file to read
     * @param tableName the name of the symbol table to fill
     * @param charSet the name of the character encoding
     * @param columnCategory the column category passed to a newly created table
     * @param nullValueStrategy the null value strategy passed to a newly created table
     * @return the symbol table that received the symbols
     * @throws MaltChainedException if the file cannot be found or read, the
     *         character set is not supported, or a symbol cannot be added
     */
    public SymbolTable loadTagset(String fileName, String tableName, String charSet, int columnCategory, String nullValueStrategy) throws MaltChainedException {
        final BufferedReader br = new BufferedReader(openReader(fileName, charSet, "tagset file"));
        try {
            final TrieSymbolTable table = addSymbolTable(tableName, columnCategory, nullValueStrategy);
            String fileLine;
            while ((fileLine = br.readLine()) != null) {
                table.addSymbol(fileLine.trim());
            }
            br.close();
            return table;
        } catch (IOException e) {
            throw new SymbolException("The tagset file '"+fileName+"' cannot be loaded. ", e);
        } finally {
            // Releases the reader on the failure path; has no effect after a successful close.
            closeQuietly(br);
        }
    }

    /**
     * Concatenates the printed form of every registered symbol table.
     *
     * @return the concatenated output of {@code printSymbolTable()} of all tables
     * @throws MaltChainedException declared for the table printing
     */
    public String printSymbolTables() throws MaltChainedException {
        final StringBuilder sb = new StringBuilder();
        for (TrieSymbolTable table : symbolTables.values()) {
            sb.append(table.printSymbolTable());
        }
        return sb.toString();
    }

    /**
     * Opens {@code fileName} for reading with the character set {@code charSet}.
     * The underlying file stream is released again if the character set is
     * rejected.
     *
     * @param fileName the file to open
     * @param charSet the name of the character encoding
     * @param description how the file is called in the "cannot be found" message
     * @return a reader over the file
     * @throws MaltChainedException if the file cannot be found or the character
     *         set is not supported
     */
    private static InputStreamReader openReader(String fileName, String charSet, String description) throws MaltChainedException {
        final FileInputStream in;
        try {
            in = new FileInputStream(fileName);
        } catch (FileNotFoundException e) {
            throw new SymbolException("The "+description+" '"+fileName+"' cannot be found. ", e);
        }
        try {
            return new InputStreamReader(in, charSet);
        } catch (UnsupportedEncodingException e) {
            // No reader took ownership of the stream, so it has to be released here.
            closeQuietly(in);
            throw new SymbolException("The char set '"+charSet+"' is not supported. ", e);
        }
    }

    /**
     * Closes {@code resource} and ignores any failure to do so. Only used for
     * cleanup where a close failure must not hide the outcome already decided.
     *
     * @param resource the resource to close
     */
    private static void closeQuietly(Closeable resource) {
        try {
            resource.close();
        } catch (IOException ignored) {
            // Deliberately ignored: the caller is either already reporting a
            // failure or has closed the resource successfully before.
        }
    }
}