From 87caeed5d45b2c134d6ab9770666218243576174 Mon Sep 17 00:00:00 2001 From: Alex Clemmer Date: Fri, 28 Oct 2016 13:11:41 -0700 Subject: [PATCH] Resolve tokenization issues causing BitFunnel parser crashes The corpus as processed by the current version of Workbench contains characters (mostly punctuation) that cause the BitFunnel parser to crash. This commit will cause Workbench to handle these cases correctly. There are 2 issues at the root of this problem: first, the Lucene analyzer (which we use to generate the BitFunnel chunk files) attempts to preserve URLs, and so colons are not removed from the middle of a term such as `Wikipedia:dump`. This causes our parser to crash. Since Lucene does remove the colon when it does not seem to appear in a URI, we simply have removed colons from all terms. Second, we are not using the Lucene tokenizer to process article titles. This leaves a wide variety of punctuation in the corpus which crashes the parser. In the new version of the corpus, the title is tokenized to avoid such problems. --- .../workbench/WikipediaDumpProcessor.java | 24 +++++++++++++++---- .../org/bitfunnel/workbench/CorpusTest.java | 6 ++--- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/bitfunnel/workbench/WikipediaDumpProcessor.java b/src/main/java/org/bitfunnel/workbench/WikipediaDumpProcessor.java index 171c5b2..28bfc6a 100644 --- a/src/main/java/org/bitfunnel/workbench/WikipediaDumpProcessor.java +++ b/src/main/java/org/bitfunnel/workbench/WikipediaDumpProcessor.java @@ -92,10 +92,23 @@ private void ProcessDocumentHeader() throws Exception { int documentId = Integer.parseUnsignedInt(matcher.group(1)); emit(String.format("%016x", documentId)); - String title = matcher.group(2); - try (StreamScope scope = new StreamScope(titleStreamId)) { - emit(title); + // NOTE: Lucene `StandardTokenizer` removes the ':' character from tokens, + // except in the case that they appear to be URIs. 
For simplicity, we + // choose to remove them entirely here. + String title = matcher.group(2).replaceAll(":", " "); + + + try (StreamScope scope = new StreamScope(titleStreamId); + TokenStream tokenStream + = analyzer.tokenStream("title", new StringReader(title))) { + tokenStream.reset(); + + CharTermAttribute term = + tokenStream.addAttribute(CharTermAttribute.class); + while (tokenStream.incrementToken()) { + emit(term.toString()); + } } } @@ -115,7 +128,10 @@ private void ProcessAllContentLines() throws Exception { private void ProcessOneContentLine() throws IOException { - String line = GetLine(); + // NOTE: Lucene `StandardTokenizer` removes the ':' character from tokens, + // except in the case that they appear to be URIs. For simplicity, we + // choose to remove them entirely here. + String line = GetLine().replaceAll(":", " "); try (TokenStream tokenStream = analyzer.tokenStream("contents", new StringReader(line))) { diff --git a/src/test/java/org/bitfunnel/workbench/CorpusTest.java b/src/test/java/org/bitfunnel/workbench/CorpusTest.java index 211ab09..c5308cb 100644 --- a/src/test/java/org/bitfunnel/workbench/CorpusTest.java +++ b/src/test/java/org/bitfunnel/workbench/CorpusTest.java @@ -61,15 +61,15 @@ public static Test suite() { */ public void testWikipediaToCorpus() { String wikipedia = - "\n" + - "This is the body text.\n" + + "\n" + + "This is the body:text.\n" + "\n" + "\n" + "Some more body text.\n" + "\n"; byte[] expected = - ("000000000000007b\00000\000one\000\00001\000body\000text\000\000\000" + + ("000000000000007b\00000\000w\000i\000k\000i\000p\000e\000d\000ia\000two\000\00001\000body\000text\000\000\000" + "00000000000001c8\00000\000two\000\00001\000some\000more\000body\000text\000\000\000" + "\000").getBytes(StandardCharsets.UTF_8);