main.py
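"""Train a classifier to find the reference-paper sentences that each
citance cites (its "provenance") and use the predicted probabilities to
build a 250-word extractive summary for every test paper of the SciSumm
2017 corpus. Corpus parsing and the similarity measures used as features
come from the local scisummgen package.
"""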
import re
from os import listdir
from os.path import join
import numpy as np
from sklearn import neural_network
from sklearn.metrics import confusion_matrix
import scisummgen
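
# Locations of the SciSumm 2017 training and test splits, relative to this script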
training_path = 'scisumm-corpus/Training-Set-2017'
test_path = 'scisumm-corpus/Test-Set-2017'


def create_features(path):
    X = []
    y = []
    # For all the papers
    for directory in listdir(path):
        paper = scisummgen.Paper(join(path, directory))
        similarity = scisummgen.Similarity(paper)
        # For all the citances
        for citance in paper.annotation.citances:
            citance_text = paper.get_citance_text(citance)
            # For all the sentences of the reference paper
            for sentence_sid, sentence in paper.reference.sentences.items():
                tfidf = similarity.tfidf_similarity(sentence['text'], citance_text)
                lsi = similarity.lsi_similarity(sentence['text'], citance_text)
                bigrams = similarity.count_bigrams(sentence['text'], citance_text)
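                # Relative position of the sentence within the paper, its
                # subsection and its section (0 when the denominator is 0)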
                try:
                    sid_pos = sentence['sid'] / sentence['sid_max']
                except ZeroDivisionError:
                    sid_pos = 0
                try:
                    ssid_pos = sentence['ssid'] / sentence['ssid_max']
                except ZeroDivisionError:
                    ssid_pos = 0
                try:
                    section_pos = sentence['section'] / sentence['section_max']
                except ZeroDivisionError:
                    section_pos = 0
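                # One example per (sentence, citance) pair: three similarity
                # scores plus the three relative positions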
                X.append([tfidf, lsi, bigrams, sid_pos, ssid_pos, section_pos])
                # Check if this sentence is also a provenance
                if sentence_sid in citance['RO']:
                    y.append(1)
                else:
                    y.append(-1)
    return X, y


def create_summaries(path, y_probability):
    y_index = 0
    # For all the test papers
    for directory in listdir(path):
        print('Creating summary for paper', directory)
        paper = scisummgen.Paper(join(path, directory))
        sentence_scores = np.zeros(next(iter(paper.reference.sentences.values()))['sid_max'] + 1)
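        # y_probability rows are consumed in the same citance x sentence
        # order in which create_features emitted the feature rows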
        # For all the citances
        for citance in paper.annotation.citances:
            # For all the sentences of the reference paper
            for sentence_sid, sentence in paper.reference.sentences.items():
                # Accumulate the probability of being a provenance
                # (column 1 of predict_proba is the score of class 1)
                sentence_scores[sentence_sid] += y_probability[y_index][1]
                y_index += 1
        # Find the sentences with the highest scores
        sentence_sid_sorted = sentence_scores.argsort()[::-1]
        summary_sentences = []
        tot_words = 0
        for sid in sentence_sid_sorted:
            sentence = paper.reference.sentences[sid]
            # Avoid considering the title
            if sentence['sid'] == 0:
                continue
            # Count the number of words
            words = len(re.findall(r'\w+', sentence['text'].lower()))
            tot_words += words
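            # Stop once the summary would exceed 250 words
            # (the length cap used by the CL-SciSumm task)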
            if tot_words <= 250:
                summary_sentences.append(sentence)
            else:
                break
        # Sort sentences by sid
        summary_sentences.sort(key=lambda x: x['sid'])
        # Create the summary
        summary = ''
        for sentence in summary_sentences:
            summary += sentence['text'] + ' '
        summary = summary.strip(' ')
        with open(join('summary', directory + '.system.txt'), 'w', encoding='utf-8') as file:
            file.write(summary)


print('Creating features for the training set')
X_training, y_training = create_features(training_path)
print('Creating features for the test set')
X_test, y_test = create_features(test_path)

print('Training the classifier')
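# Five hidden layers of 100 units each; a small initial learning rate keeps
# training stable, and verbose=True logs the loss at every iteration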
clf = neural_network.MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100, 100),
                                   learning_rate_init=0.0001, verbose=True)
clf = clf.fit(X_training, y_training)
y_prediction = clf.predict(X_test)
conf_mat = confusion_matrix(y_test, y_prediction)
print('Confusion matrix on test set')
print(conf_mat)
create_summaries(test_path, clf.predict_proba(X_test))