This repository was archived by the owner on May 22, 2019. It is now read-only.

Add attach last upper to the next token implementation #404

Merged
merged 3 commits on Apr 16, 2019

34 changes: 19 additions & 15 deletions sourced/ml/algorithms/token_parser.py
@@ -33,18 +33,22 @@ class TokenParser:
# if True we have only ["sourc", "algorithm"].
# if you do not want to filter small tokens set min_split_length=1.
SAVE_TOKEN_STYLE = False # whether to yield metadata that can be used to reconstruct the initial
# identifier
# identifier.
ATTACH_UPPER = True # True to attach the last of several uppercase letters in a row to
# the next token. Example: 'HTMLResponse' -> ["html", "response"] if True,
# 'HTMLResponse' -> ["htmlr", "esponse"] if False.

def __init__(self, stem_threshold=STEM_THRESHOLD, max_token_length=MAX_TOKEN_LENGTH,
min_split_length=MIN_SPLIT_LENGTH, single_shot=DEFAULT_SINGLE_SHOT,
save_token_style=SAVE_TOKEN_STYLE):
save_token_style=SAVE_TOKEN_STYLE, attach_upper=ATTACH_UPPER):
self._stemmer = Stemmer.Stemmer("english")
self._stemmer.maxCacheSize = 0
self._stem_threshold = stem_threshold
self._max_token_length = max_token_length
self._min_split_length = min_split_length
self._single_shot = single_shot
self._save_token_style = save_token_style
self._attach_upper = attach_upper
if self._save_token_style and not self._single_shot:
raise ValueError("Only one of `single_shot`/`save_token_style` should be True")
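
A minimal usage sketch of the new attach_upper flag (not part of the diff), assuming the TokenParser constructor and process_token API shown in this PR. A high stem_threshold keeps stemming from altering the splits; the expected outputs follow the ATTACH_UPPER docstring example above.

from sourced.ml.algorithms.token_parser import TokenParser

# attach_upper=True (the new default): the last capital of an acronym run
# starts the next token.
tp = TokenParser(stem_threshold=100, single_shot=True, min_split_length=1,
                 max_token_length=100, attach_upper=True)
print(list(tp.process_token("HTMLResponse")))   # ["html", "response"] per the docstring

# attach_upper=False (the previous behaviour): the whole capital run stays together.
tp = TokenParser(stem_threshold=100, single_shot=True, min_split_length=1,
                 max_token_length=100, attach_upper=False)
print(list(tp.process_token("HTMLResponse")))   # ["htmlr", "esponse"] per the docstring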

@@ -143,22 +147,22 @@ def ret(name):
yield part, TokenStyle.DELIMITER
continue
assert part.isalpha()
prev = part[0]
pos = 0
start = 0
for i in range(1, len(part)):
this = part[i]
prev = part[i - 1]
if prev.islower() and this.isupper():
yield from ret(part[pos:i])
pos = i
yield from ret(part[start:i])
start = i
elif prev.isupper() and this.islower():
if 0 < i - 1 - pos <= self.min_split_length:
yield from ret(part[pos:i])
pos = i
elif i - 1 > pos:
yield from ret(part[pos:i])
pos = i
prev = this
last = part[pos:]
if self._attach_upper and i > 1 and part[i - 2].isupper():
new_start = i - 1
else:
new_start = i
if i - 1 > start:
yield from ret(part[start:new_start])
start = new_start
last = part[start:]
if last:
yield from ret(last)
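
The case-transition rule can also be read in isolation. Below is a self-contained sketch (not repository code; split_camel is an illustrative name) of the loop above with the attach_upper option, stripped of stemming, digit and delimiter handling:

def split_camel(part, attach_upper=True):
    """Yield lower-cased words of an alphabetic token, splitting at case transitions."""
    start = 0
    for i in range(1, len(part)):
        prev, this = part[i - 1], part[i]
        if prev.islower() and this.isupper():
            # lower -> UPPER starts a new word: "camelCase" -> "camel", "Case"
            yield part[start:i].lower()
            start = i
        elif prev.isupper() and this.islower():
            # UPPER -> lower ends an acronym run; optionally hand its last
            # capital to the next word: "HTMLResponse" -> "HTML", "Response"
            new_start = i - 1 if attach_upper and i > 1 and part[i - 2].isupper() else i
            if i - 1 > start:
                yield part[start:new_start].lower()
                start = new_start
    if part[start:]:
        yield part[start:].lower()

print(list(split_camel("HTMLResponse", attach_upper=True)))   # ['html', 'response']
print(list(split_camel("HTMLResponse", attach_upper=False)))  # ['htmlr', 'esponse']

With attach_upper=True the acronym keeps all but its last capital, which is what the new test below expects for inputs such as "SQLThing" -> ["sql", "thing"] and "FRAPScase" -> ["frap", "scase"].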

@@ -188,7 +192,7 @@ def __setstate__(self, state):

class NoopTokenParser:
"""
One can use this class if he or she does not want to do any parsing.
One can use this class if one does not want to do any parsing.
"""

def process_token(self, token):
53 changes: 52 additions & 1 deletion sourced/ml/tests/test_token_parser.py
@@ -6,13 +6,15 @@

class TokenParserTests(unittest.TestCase):
def setUp(self):
self.tp = TokenParser(stem_threshold=4, max_token_length=20)
self.tp = TokenParser(stem_threshold=4, max_token_length=20, attach_upper=False)
self.tp._single_shot = False

def test_process_token(self):
self.tp.max_token_length = 100

tokens = [
("ONLYCAPS", ["onlycap"]),
("nocaps", ["nocap"]),
("UpperCamelCase", ["upper", "camel", "case"]),
("camelCase", ["camel", "case"]),
("FRAPScase", ["frap", "case"]),
@@ -53,11 +55,58 @@ def test_process_token(self):
res = list(self.tp.process_token(token))
self.assertEqual(res, correct)

def test_process_token_with_attach_upper(self):
tp = TokenParser(stem_threshold=100, single_shot=True, max_token_length=100,
min_split_length=1)
tokens = [
("ONLYCAPS", ["onlycaps"]),
("nocaps", ["nocaps"]),
("UpperCamelCase", ["upper", "camel", "case"]),
("camelCase", ["camel", "case"]),
("FRAPScase", ["frap", "scase"]),
("SQLThing", ["sql", "thing"]),
("_Astra", ["astra"]),
("CAPS_CONST", ["caps", "const"]),
("_something_SILLY_", ["something", "silly"]),
("blink182", ["blink"]),
("FooBar100500Bingo", ["foo", "bar", "bingo"]),
("Man45var", ["man", "var"]),
("method_name", ["method", "name"]),
("Method_Name", ["method", "name"]),
("101dalms", ["dalms"]),
("101_dalms", ["dalms"]),
("101_DalmsBug", ["dalms", "bug"]),
("101_Dalms45Bug7", ["dalms", "bug"]),
("wdSize", ["wd", "size"]),
("Glint", ["glint"]),
("foo_BAR", ["foo", "bar"]),
("sourced.ml.algorithms.uast_ids_to_bag",
["sourced", "ml", "algorithms", "uast", "ids", "to", "bag"]),
("WORSTnameYOUcanIMAGINE", ["wors", "tname", "yo", "ucan", "imagine"]),
# Another bad example. The parser fails to parse it correctly.
("SmallIdsToFoOo", ["small", "ids", "to", "fo", "oo"]),
("SmallIdFooo", ["small", "id", "fooo"]),
("ONE_M0re_.__badId.example", ["one", "m", "re", "bad",
"id", "example"]),
("never_use_Such__varsableNames", ["never", "use", "such", "varsable", "names"]),
("a.b.c.d", ["a", "b", "c", "d"]),
("A.b.Cd.E", ["a", "b", "cd", "e"]),
("looong_sh_loooong_sh", ["looong", "sh", "loooong", "sh"]),
("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]),
("loooong_loooong_loooong", ["loooong", "loooong", "loooong"])
]

for token, correct in tokens:
res = list(tp.process_token(token))
self.assertEqual(res, correct)

def test_process_token_single_shot(self):
self.tp.max_token_length = 100
self.tp._single_shot = True
self.tp.min_split_length = 1
tokens = [
("ONLYCAPS", ["onlycap"]),
("nocaps", ["nocap"]),
("UpperCamelCase", ["upper", "camel", "case"]),
("camelCase", ["camel", "case"]),
("FRAPScase", ["frap", "case"]),
@@ -135,6 +184,8 @@ def test_reconstruct(self):
self.tp.min_split_length = 1

tokens = [
"ONLYCAPS",
"nocaps",
"UpperCamelCase",
"camelCase",
"FRAPScase",