Skip to content

Commit 3b56e8b

Browse files
committed
Initial commit.
1 parent f892e01 commit 3b56e8b

10 files changed

+653
-0
lines changed

find_collocations.py

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/usr/bin/python
2+
import sys
3+
import csv
4+
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
5+
from nltk.text import *
6+
import wsd
7+
# Build the initial decision list for one ambiguous word from the manually
# tagged occurrences listed in senses_<word>.csv.
#
# Command line: <script> word sense1 sense2
if len(sys.argv) != 4:
    print("Usage: %s word sense1 sense2" % sys.argv[0])
    # sys.exit rather than the bare `exit` helper, which is only injected by
    # the `site` module and is meant for interactive sessions.
    sys.exit(-1)

corpus = PlaintextCorpusReader('outcorpus/', '.*')
focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]  # e.g. ["manufacturing", "life"]

# One collector per collocation type. The integer is passed through to the
# wsd classes unchanged -- presumably the collector's index inside the
# decision list; confirm against wsd.py.
collocations = [
    wsd.BigramLeft(senses, 0),
    wsd.BigramRight(senses, 1),
    wsd.BigramScope(senses, 2, [2, 10]),
]

# Each CSV row is "<corpus file id>,<token offset>,<sense>".
with open("senses_" + focal_word + ".csv") as senses_file:
    for infile, offset, sense in csv.reader(senses_file):
        text = Text(corpus.words(infile))
        for collocation in collocations:
            collocation.add_collocation(text, int(offset), sense)

# Debugging aid: inspect a single raw frequency entry.
#print(list(list(collocations[0].frequencies.items())[0][1].items())[0][1])

decision_list = wsd.DecisionList()
for position, collocation in enumerate(collocations):
    if position:
        # Blank line between consecutive reports, none after the last one.
        print("")
    print(collocation.frequencies)
    print(collocation.update_decision_list(decision_list))
    print(decision_list.decision_items)
decision_list.save("senses_bootstrap_" + focal_word + ".csv")

make_string_list.py

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!/usr/bin/python
2+
import sys
3+
def quote_lines(lines):
    """Return *lines* as a single comma-separated list of quoted strings.

    Each item has surrounding whitespace (including the trailing newline)
    stripped and is wrapped in double quotes; no escaping is performed.
    An empty input yields the empty string.
    """
    return ','.join('"' + line.strip() + '"' for line in lines)


def main(argv):
    """Print the quoted list for the file named by argv[1].

    Returns the process exit status: -1 on a usage error, 0 otherwise.
    """
    if len(argv) != 2:
        print("Usage: %s file" % argv[0])
        return -1

    # The context manager closes the file; the original left it open.
    with open(argv[1]) as infile:
        print(quote_lines(infile))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

normalize_text.py

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#!/usr/bin/python
2+
import sys
3+
from os import listdir
4+
from os.path import isfile, join
5+
import nltk
6+
from nltk.tokenize import RegexpTokenizer
7+
from nltk.corpus import *
8+
from nltk.stem.porter import *
9+
# Command line: <script> in_dir out_dir
if len(sys.argv) != 3:
    print("Usage: %s in_dir out_dir" % sys.argv[0])
    # sys.exit rather than the bare `exit` helper, which is only injected by
    # the `site` module and is meant for interactive sessions.
    sys.exit(-1)

in_dir = sys.argv[1]
out_dir = sys.argv[2]

# Regular files directly inside in_dir (sub-directories are skipped).
candidates = (join(in_dir, name) for name in listdir(in_dir))
filenames = [path for path in candidates if isfile(path)]

# Module-level state read and updated by extract_text() across calls.
is_text = False    # True while between a <TEXT> line and its </TEXT> line
text = ""          # lower-cased body accumulated for the current section
corpus_index = 1   # numeric suffix of the next "corpus<N>" output file
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()  # only referenced by the disabled stemming code
23+
def extract_text(line):
    """Feed one input line to the <TEXT> ... </TEXT> section extractor.

    A line starting with "<TEXT>" opens a section; following lines are
    stripped, lower-cased and accumulated; a line starting with "</TEXT>"
    tokenizes the accumulated text and, if any tokens remain, writes them
    space-separated to ``<out_dir>/corpus<corpus_index>``.

    State is kept in the module globals ``is_text``, ``text`` and
    ``corpus_index``. Always returns None.

    NOTE(review): indentation was lost in the source this was recovered
    from. The state reset and the ``corpus_index`` increment are assumed to
    run for every closing tag, not only when a file is written -- confirm.
    """
    # tokenizer, stemmer and out_dir are only read, so they need no
    # `global` declaration.
    global is_text, text, corpus_index

    if line.startswith("<TEXT>"):
        is_text = True
    elif line.startswith("</TEXT>"):
        tokens = tokenizer.tokenize(text)

        # Stop-word removal and stemming are currently disabled:
        #stop = set(stopwords.words("english"))
        #words = [w for w in tokens if w not in stop]

        #words_stemmed = []
        #for word in words:
        #    words_stemmed.append(stemmer.stem(word))

        #final_text = ' '.join(words_stemmed)
        final_text = ' '.join(tokens)
        #print(final_text)  # FOR debugging

        if final_text:
            # `with` guarantees the file is closed even if write() raises.
            out_path = join(out_dir, "corpus" + str(corpus_index))
            with open(out_path, "w") as out_file:
                out_file.write(final_text)

        is_text = False
        text = ""
        corpus_index += 1
    elif is_text:
        text = text + line.strip().lower() + " "
    return None
53+
# Process the input files in sorted order so the corpus<N> numbering is
# reproducible from run to run.
for filename in sorted(filenames):
    print(filename)
    # A plain loop instead of a list comprehension: extract_text() is called
    # purely for its side effects, and `with` closes each input file.
    with open(filename) as infile:
        for line in infile:
            extract_text(line)
# For a quick single-file debugging run, iterate over filenames[:1] instead.

senses_plant.csv

+212
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
corpus20065,110,manufacturing
2+
corpus20091,44,manufacturing
3+
corpus20091,200,manufacturing
4+
corpus20093,273,manufacturing
5+
corpus20093,335,manufacturing
6+
corpus20106,439,manufacturing
7+
corpus20106,457,manufacturing
8+
corpus20108,66,manufacturing
9+
corpus20115,32,life
10+
corpus20120,39,manufacturing
11+
corpus20120,66,manufacturing
12+
corpus20137,4,manufacturing
13+
corpus20143,32,manufacturing
14+
corpus20143,39,manufacturing
15+
corpus20143,214,manufacturing
16+
corpus20143,228,manufacturing
17+
corpus20143,264,manufacturing
18+
corpus20143,267,manufacturing
19+
corpus20143,306,manufacturing
20+
corpus20143,312,manufacturing
21+
corpus20143,353,manufacturing
22+
corpus20145,16,manufacturing
23+
corpus20145,60,manufacturing
24+
corpus20145,132,manufacturing
25+
corpus20145,279,manufacturing
26+
corpus20145,282,manufacturing
27+
corpus20153,64,manufacturing
28+
corpus20153,523,manufacturing
29+
corpus20158,128,manufacturing
30+
corpus2016,438,manufacturing
31+
corpus20187,51,manufacturing
32+
corpus20189,0,manufacturing
33+
corpus20189,21,manufacturing
34+
corpus2019,3,manufacturing
35+
corpus2019,24,manufacturing
36+
corpus2019,46,manufacturing
37+
corpus20215,127,manufacturing
38+
corpus20237,372,manufacturing
39+
corpus2029,96,life
40+
corpus20321,78,manufacturing
41+
corpus20321,156,manufacturing
42+
corpus20321,167,manufacturing
43+
corpus20321,183,manufacturing
44+
corpus20361,16,manufacturing
45+
corpus20361,44,manufacturing
46+
corpus20361,74,manufacturing
47+
corpus20361,83,manufacturing
48+
corpus20361,120,manufacturing
49+
corpus20361,290,manufacturing
50+
corpus20361,318,manufacturing
51+
corpus20361,329,manufacturing
52+
corpus20361,401,manufacturing
53+
corpus20365,391,manufacturing
54+
corpus20394,429,life
55+
corpus20394,435,life
56+
corpus20417,94,life
57+
corpus20432,235,manufacturing
58+
corpus20448,280,manufacturing
59+
corpus20451,10,manufacturing
60+
corpus20451,24,manufacturing
61+
corpus20451,67,manufacturing
62+
corpus20451,95,manufacturing
63+
corpus20451,159,manufacturing
64+
corpus20469,21,life
65+
corpus20469,44,life
66+
corpus20469,74,life
67+
corpus20469,80,life
68+
corpus20485,100,manufacturing
69+
corpus2049,216,manufacturing
70+
corpus20508,5,manufacturing
71+
corpus20508,85,manufacturing
72+
corpus20512,72,manufacturing
73+
corpus20512,146,manufacturing
74+
corpus20542,108,manufacturing
75+
corpus20542,231,manufacturing
76+
corpus20542,238,manufacturing
77+
corpus20559,12,manufacturing
78+
corpus20559,18,manufacturing
79+
corpus20559,98,manufacturing
80+
corpus20559,244,manufacturing
81+
corpus20559,250,manufacturing
82+
corpus20566,186,manufacturing
83+
corpus10037,197,manufacturing
84+
corpus10037,215,manufacturing
85+
corpus10037,250,manufacturing
86+
corpus1011,6,life
87+
corpus10124,237,life
88+
corpus10189,40,manufacturing
89+
corpus10189,176,manufacturing
90+
corpus1019,295,manufacturing
91+
corpus10263,110,life
92+
corpus10270,82,manufacturing
93+
corpus10287,3,life
94+
corpus10287,32,life
95+
corpus10287,36,life
96+
corpus10303,350,life
97+
corpus10312,47,manufacturing
98+
corpus10315,342,manufacturing
99+
corpus10327,283,manufacturing
100+
corpus1034,89,manufacturing
101+
corpus1034,105,manufacturing
102+
corpus1034,116,manufacturing
103+
corpus1034,128,manufacturing
104+
corpus1034,138,manufacturing
105+
corpus1034,207,manufacturing
106+
corpus1034,258,manufacturing
107+
corpus1034,285,manufacturing
108+
corpus10395,200,manufacturing
109+
corpus10395,209,manufacturing
110+
corpus10395,223,manufacturing
111+
corpus10395,227,manufacturing
112+
corpus10395,271,manufacturing
113+
corpus10395,277,manufacturing
114+
corpus10395,306,manufacturing
115+
corpus10395,314,manufacturing
116+
corpus10395,366,manufacturing
117+
corpus10395,379,manufacturing
118+
corpus10399,49,manufacturing
119+
corpus10404,103,manufacturing
120+
corpus10411,31,manufacturing
121+
corpus10411,59,manufacturing
122+
corpus10411,105,manufacturing
123+
corpus10411,109,manufacturing
124+
corpus10411,181,manufacturing
125+
corpus10411,186,manufacturing
126+
corpus10411,218,manufacturing
127+
corpus10411,223,manufacturing
128+
corpus10411,228,manufacturing
129+
corpus10412,87,manufacturing
130+
corpus10412,103,manufacturing
131+
corpus1042,665,manufacturing
132+
corpus10423,12,manufacturing
133+
corpus10425,218,manufacturing
134+
corpus10436,18,manufacturing
135+
corpus10436,20,manufacturing
136+
corpus10436,31,manufacturing
137+
corpus10436,65,manufacturing
138+
corpus10436,161,manufacturing
139+
corpus10436,183,manufacturing
140+
corpus10436,196,manufacturing
141+
corpus10436,225,manufacturing
142+
corpus10436,292,manufacturing
143+
corpus10438,18,manufacturing
144+
corpus10438,20,manufacturing
145+
corpus10438,31,manufacturing
146+
corpus10438,67,manufacturing
147+
corpus10438,163,manufacturing
148+
corpus10438,185,manufacturing
149+
corpus10438,198,manufacturing
150+
corpus10438,227,manufacturing
151+
corpus10438,294,manufacturing
152+
corpus10475,317,life
153+
corpus10485,26,manufacturing
154+
corpus10485,95,manufacturing
155+
corpus10485,132,manufacturing
156+
corpus10485,136,manufacturing
157+
corpus105,273,manufacturing
158+
corpus10508,295,manufacturing
159+
corpus10519,33,manufacturing
160+
corpus10519,173,manufacturing
161+
corpus10548,172,manufacturing
162+
corpus10549,243,manufacturing
163+
corpus10566,302,manufacturing
164+
corpus10570,14,manufacturing
165+
corpus10570,126,manufacturing
166+
corpus10570,160,manufacturing
167+
corpus10570,265,manufacturing
168+
corpus10570,277,manufacturing
169+
corpus10570,310,manufacturing
170+
corpus10580,189,manufacturing
171+
corpus10591,267,life
172+
corpus10591,360,life
173+
corpus10591,415,life
174+
corpus1065,14,manufacturing
175+
corpus1065,91,manufacturing
176+
corpus1065,234,manufacturing
177+
corpus10696,143,manufacturing
178+
corpus10725,320,manufacturing
179+
corpus10739,269,life
180+
corpus10741,306,manufacturing
181+
corpus10753,114,life
182+
corpus1076,198,life
183+
corpus10779,116,manufacturing
184+
corpus10782,51,life
185+
corpus10783,245,life
186+
corpus10795,147,life
187+
corpus10810,64,life
188+
corpus10819,52,life
189+
corpus10893,15,life
190+
corpus10937,429,manufacturing
191+
corpus10964,31,manufacturing
192+
corpus10969,6,life
193+
corpus10969,30,life
194+
corpus10969,84,manufacturing
195+
corpus10980,18,manufacturing
196+
corpus10980,47,manufacturing
197+
corpus10980,78,manufacturing
198+
corpus10980,98,manufacturing
199+
corpus10980,140,manufacturing
200+
corpus10980,147,manufacturing
201+
corpus10980,234,manufacturing
202+
corpus10980,274,manufacturing
203+
corpus10980,366,manufacturing
204+
corpus10980,374,manufacturing
205+
corpus11004,143,manufacturing
206+
corpus1101,9,life
207+
corpus1101,24,life
208+
corpus1101,81,life
209+
corpus11010,309,life
210+
corpus11010,320,manufacturing
211+
corpus11035,155,manufacturing
212+
corpus11057,74,manufacturing

tag.sh

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
#!/bin/sh
# Run the whole sense-tagging pipeline for one ambiguous word.

INCORPUS=corpus
OUTCORPUS=outcorpus
WORD=plant
# Must be quoted: unquoted, sh parses `SENSES=manufacturing life` as "run
# the command `life` with SENSES=manufacturing in its environment" and the
# variable is never set in this shell.
SENSES="manufacturing life"

./normalize_text.py "$INCORPUS" "$OUTCORPUS"
# $SENSES is deliberately left unquoted so it splits into two arguments:
# find_collocations.py and tag_senses_bootstrap.py require exactly
# "word sense1 sense2".
# NOTE(review): tag_sense_manual.py and verify.py are assumed to take the
# senses the same way -- confirm against their argument checks.
./tag_sense_manual.py "$OUTCORPUS" "$WORD" $SENSES
./find_collocations.py "$WORD" $SENSES
./tag_senses_bootstrap.py "$WORD" $SENSES
./verify.py "$WORD" $SENSES

tag_senses_bootstrap.py

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/usr/bin/python
2+
import sys
3+
import wsd
4+
import nltk
5+
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
6+
from nltk.text import *
7+
# Extend the decision list saved by find_collocations.py by tagging every
# occurrence of the focal word in the corpus that the current list can
# classify, feeding each newly tagged occurrence back into the list.
#
# Command line: <script> word sense1 sense2
if len(sys.argv) != 4:
    print("Usage: %s word sense1 sense2" % sys.argv[0])
    # sys.exit rather than the bare `exit` helper, which is only injected by
    # the `site` module and is meant for interactive sessions.
    sys.exit(-1)

# argv[0] is the script name, so the three arguments are argv[1..3]. The
# original read argv[2..4], which raised IndexError on argv[4] and shifted
# the word and senses by one position.
focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]  # e.g. "plant", ["manufacturing", "life"]

corpus = PlaintextCorpusReader('outcorpus/', '.*')
collocations = [
    wsd.BigramLeft(senses, 0),
    wsd.BigramRight(senses, 1),
    wsd.BigramScope(senses, 2, [2, 10]),
]
decision_list = wsd.DecisionList()
decision_list.load("senses_bootstrap_" + focal_word + ".csv")

# List and sort the corpus files once instead of on every iteration.
fileids = sorted(corpus.fileids())
total = len(fileids)
for i, infile in enumerate(fileids):
    print("%d / %d" % (i, total))  # progress indicator

    text = Text(corpus.words(infile))
    offsets = nltk.ConcordanceIndex(text.tokens).offsets(focal_word)

    for offset in offsets:
        for collocation in collocations:
            tokens = collocation.get_collocation(text, offset)
            if tokens is None:
                continue
            sense = decision_list.get_sense(tokens, collocation.index)
            if sense is None:
                continue
            # The list could classify this occurrence: record it and refresh
            # the decision list so later occurrences can benefit.
            collocation.add_collocation(text, offset, sense)
            collocation.update_decision_list(decision_list)
            #decision_list.add_sense(sense, tokens, collocation.index, score)
            print(sense)

decision_list.save("senses_bootstrap_" + focal_word + ".csv")
43+

0 commit comments

Comments
 (0)