This repository was archived by the owner on May 22, 2019. It is now read-only.

Add attach last upper to the next token implementation #404

Merged
merged 3 commits on Apr 16, 2019

34 changes: 19 additions & 15 deletions sourced/ml/algorithms/token_parser.py
@@ -33,18 +33,22 @@ class TokenParser:
# if True we have only ["sourc", "algorithm"].
# if you do not want to filter small tokens set min_split_length=1.
SAVE_TOKEN_STYLE = False # whether to yield metadata that can be used to reconstruct the initial
# identifier
# identifier.
ATTACH_UPPER = True # True to attach the last of several uppercase letters in a row to
# the next token. Example: 'HTMLResponse' -> ["html", "response"] if True,
# 'HTMLResponse' -> ["htmlr", "esponse"] if False.

def __init__(self, stem_threshold=STEM_THRESHOLD, max_token_length=MAX_TOKEN_LENGTH,
min_split_length=MIN_SPLIT_LENGTH, single_shot=DEFAULT_SINGLE_SHOT,
save_token_style=SAVE_TOKEN_STYLE):
save_token_style=SAVE_TOKEN_STYLE, attach_upper=ATTACH_UPPER):
self._stemmer = Stemmer.Stemmer("english")
self._stemmer.maxCacheSize = 0
self._stem_threshold = stem_threshold
self._max_token_length = max_token_length
self._min_split_length = min_split_length
self._single_shot = single_shot
self._save_token_style = save_token_style
self._attach_upper = attach_upper
if self._save_token_style and not self._single_shot:
raise ValueError("Only one of `single_shot`/`save_token_style` should be True")
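
A minimal usage sketch of the new attach_upper flag (not part of the diff), assuming the TokenParser constructor and process_token API shown in this PR. A high stem_threshold keeps stemming from altering the splits; the expected outputs follow the ATTACH_UPPER docstring example above.

from sourced.ml.algorithms.token_parser import TokenParser

# attach_upper=True (the new default): the last capital of an acronym run
# starts the next token.
tp = TokenParser(stem_threshold=100, single_shot=True, min_split_length=1,
                 max_token_length=100, attach_upper=True)
print(list(tp.process_token("HTMLResponse")))   # ["html", "response"] per the docstring

# attach_upper=False (the previous behaviour): the whole capital run stays together.
tp = TokenParser(stem_threshold=100, single_shot=True, min_split_length=1,
                 max_token_length=100, attach_upper=False)
print(list(tp.process_token("HTMLResponse")))   # ["htmlr", "esponse"] per the docstring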

@@ -143,22 +147,22 @@ def ret(name):
yield part, TokenStyle.DELIMITER
continue
assert part.isalpha()
prev = part[0]
pos = 0
start = 0
for i in range(1, len(part)):
this = part[i]
prev = part[i - 1]
if prev.islower() and this.isupper():
yield from ret(part[pos:i])
pos = i
yield from ret(part[start:i])
start = i
elif prev.isupper() and this.islower():
if 0 < i - 1 - pos <= self.min_split_length:
yield from ret(part[pos:i])
pos = i
elif i - 1 > pos:
yield from ret(part[pos:i])
pos = i
prev = this
last = part[pos:]
if self._attach_upper and i > 1 and part[i - 2].isupper():
new_start = i - 1
else:
new_start = i
if i - 1 > start:
yield from ret(part[start:new_start])
start = new_start
last = part[start:]
if last:
yield from ret(last)
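
The case-transition rule can also be read in isolation. Below is a self-contained sketch (not repository code; split_camel is an illustrative name) of the loop above with the attach_upper option, stripped of stemming, digit and delimiter handling:

def split_camel(part, attach_upper=True):
    """Yield lower-cased words of an alphabetic token, splitting at case transitions."""
    start = 0
    for i in range(1, len(part)):
        prev, this = part[i - 1], part[i]
        if prev.islower() and this.isupper():
            # lower -> UPPER starts a new word: "camelCase" -> "camel", "Case"
            yield part[start:i].lower()
            start = i
        elif prev.isupper() and this.islower():
            # UPPER -> lower ends an acronym run; optionally hand its last
            # capital to the next word: "HTMLResponse" -> "HTML", "Response"
            new_start = i - 1 if attach_upper and i > 1 and part[i - 2].isupper() else i
            if i - 1 > start:
                yield part[start:new_start].lower()
                start = new_start
    if part[start:]:
        yield part[start:].lower()

print(list(split_camel("HTMLResponse", attach_upper=True)))   # ['html', 'response']
print(list(split_camel("HTMLResponse", attach_upper=False)))  # ['htmlr', 'esponse']

With attach_upper=True the acronym keeps all but its last capital, which is what the new test below expects for inputs such as "SQLThing" -> ["sql", "thing"] and "FRAPScase" -> ["frap", "scase"].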

@@ -188,7 +192,7 @@ def __setstate__(self, state):

class NoopTokenParser:
"""
One can use this class if he or she does not want to do any parsing.
One can use this class if one does not want to do any parsing.
"""

def process_token(self, token):
53 changes: 52 additions & 1 deletion sourced/ml/tests/test_token_parser.py
@@ -6,13 +6,15 @@

class TokenParserTests(unittest.TestCase):
def setUp(self):
self.tp = TokenParser(stem_threshold=4, max_token_length=20)
self.tp = TokenParser(stem_threshold=4, max_token_length=20, attach_upper=False)
self.tp._single_shot = False

def test_process_token(self):
self.tp.max_token_length = 100

tokens = [
("ONLYCAPS", ["onlycap"]),
("nocaps", ["nocap"]),
("UpperCamelCase", ["upper", "camel", "case"]),
("camelCase", ["camel", "case"]),
("FRAPScase", ["frap", "case"]),
@@ -53,11 +55,58 @@ def test_process_token(self):
res = list(self.tp.process_token(token))
self.assertEqual(res, correct)

def test_process_token_with_attach_upper(self):
tp = TokenParser(stem_threshold=100, single_shot=True, max_token_length=100,
min_split_length=1)
tokens = [
("ONLYCAPS", ["onlycaps"]),
("nocaps", ["nocaps"]),
("UpperCamelCase", ["upper", "camel", "case"]),
("camelCase", ["camel", "case"]),
("FRAPScase", ["frap", "scase"]),
("SQLThing", ["sql", "thing"]),
("_Astra", ["astra"]),
("CAPS_CONST", ["caps", "const"]),
("_something_SILLY_", ["something", "silly"]),
("blink182", ["blink"]),
("FooBar100500Bingo", ["foo", "bar", "bingo"]),
("Man45var", ["man", "var"]),
("method_name", ["method", "name"]),
("Method_Name", ["method", "name"]),
("101dalms", ["dalms"]),
("101_dalms", ["dalms"]),
("101_DalmsBug", ["dalms", "bug"]),
("101_Dalms45Bug7", ["dalms", "bug"]),
("wdSize", ["wd", "size"]),
("Glint", ["glint"]),
("foo_BAR", ["foo", "bar"]),
("sourced.ml.algorithms.uast_ids_to_bag",
["sourced", "ml", "algorithms", "uast", "ids", "to", "bag"]),
("WORSTnameYOUcanIMAGINE", ["wors", "tname", "yo", "ucan", "imagine"]),
# Another bad example. The parser fails to parse it correctly.
("SmallIdsToFoOo", ["small", "ids", "to", "fo", "oo"]),
("SmallIdFooo", ["small", "id", "fooo"]),
("ONE_M0re_.__badId.example", ["one", "m", "re", "bad",
"id", "example"]),
("never_use_Such__varsableNames", ["never", "use", "such", "varsable", "names"]),
("a.b.c.d", ["a", "b", "c", "d"]),
("A.b.Cd.E", ["a", "b", "cd", "e"]),
("looong_sh_loooong_sh", ["looong", "sh", "loooong", "sh"]),
("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]),
("loooong_loooong_loooong", ["loooong", "loooong", "loooong"])
]

for token, correct in tokens:
res = list(tp.process_token(token))
self.assertEqual(res, correct)

def test_process_token_single_shot(self):
self.tp.max_token_length = 100
self.tp._single_shot = True
self.tp.min_split_length = 1
tokens = [
("ONLYCAPS", ["onlycap"]),
("nocaps", ["nocap"]),
("UpperCamelCase", ["upper", "camel", "case"]),
("camelCase", ["camel", "case"]),
("FRAPScase", ["frap", "case"]),
@@ -135,6 +184,8 @@ def test_reconstruct(self):
self.tp.min_split_length = 1

tokens = [
"ONLYCAPS",
"nocaps",
"UpperCamelCase",
"camelCase",
"FRAPScase",