stackoverflow_dataset.py

import os

# Must be set before modin is imported, otherwise it has no effect.
os.environ["__MODIN_AUTOIMPORT_PANDAS__"] = "1"

import time
from typing import List, Optional

import modin.pandas as modin_pd
import pandas as pd
import ray
from bs4 import BeautifulSoup

from haystack.nodes.other import Dataset
from haystack.schema import Document

def _generate_documents(batch: pd.DataFrame) -> List[Document]:
    """Convert a batch of merged question/answer rows into Haystack Documents."""
    docs_to_index = batch.to_dict(orient="records")
    # The 'text' column becomes the Document content; every other column is kept as metadata.
    meta_keys = [k for k in docs_to_index[0].keys() if k != 'text']
    docs = [{'content': d['text'], 'meta': {k: d[k] for k in meta_keys}} for d in docs_to_index]
    return [Document.from_dict(d) for d in docs]
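
# For illustration (assuming the merged schema produced by convert() below), a row like
#   {"question_id": 42, "text": "How do I ...?", "question-body": "...", "answer": "..."}
# becomes
#   Document(content="How do I ...?",
#            meta={"question_id": 42, "question-body": "...", "answer": "..."})
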
class StackoverflowDataset(Dataset):
    """
    This node converts the Stack Overflow questions/answers CSV dump into a
    ray.data.Dataset of Haystack Documents.

    :param question_file: Path to the questions CSV file.
    :param answer_file: Path to the answers CSV file.
    :param batch_size: Batch size used when mapping rows to Documents.
    """

    outgoing_edges = 1

    def __init__(self,
                 question_file: str,
                 answer_file: str,
                 batch_size: Optional[int] = 4096,
                 ):
        super().__init__(batch_size=batch_size)
        self.question_file = question_file
        self.answer_file = answer_file
    def convert(self) -> ray.data.Dataset:
        print('start to read answers...')
        start = time.time()
        answers = modin_pd.read_csv(self.answer_file, encoding="ISO-8859-1")
        time1 = time.time()
        print(f"read answers csv time = {time1 - start}")
        # The answer's own Id is not needed: answers are joined to questions via ParentId.
        answers.drop(columns=['OwnerUserId', 'CreationDate', 'Id'], inplace=True)
        time2 = time.time()
        print(f"answers drop time = {time2 - time1}")
        answers = ray.data.from_modin(answers)
        time3 = time.time()
        print(f"convert modin df to ray dataset time = {time3 - time2}")

        print('start to process answers...')
        # Keep only the top-scored answer(s) for each question.
        top_answers = answers.groupby("ParentId").map_groups(
            lambda g: g[g['Score'] == g['Score'].max()]
        )
        time4 = time.time()
        print(f"answers groupby time = {time4 - time3}")
        top_answers = top_answers.to_modin()
        time5 = time.time()
        print(f"convert ray dataset to modin df time = {time5 - time4}")
        # Rename ParentId to Id so answers can be merged with their questions.
        top_answers.rename(columns={"ParentId": "Id", "Body": "TopAnswer-html"}, inplace=True)
        time6 = time.time()
        print(f"answers rename time = {time6 - time5}")
        top_answers.drop(columns=['Score'], inplace=True)
        time7 = time.time()
        print(f"answers drop time = {time7 - time6}")
        print('start to process questions...')
        questions = modin_pd.read_csv(self.question_file, encoding="ISO-8859-1")
        time8 = time.time()
        print(f"read questions csv time = {time8 - time7}")
        questions.drop(columns=['OwnerUserId', 'CreationDate', 'ClosedDate', 'Score'], inplace=True)
        time9 = time.time()
        print(f"questions drop time = {time9 - time8}")
        # The question title becomes the Document content ('text').
        questions.rename(columns={"Title": "text", "Body": "Question-html"}, inplace=True)
        time10 = time.time()
        print(f"questions rename time = {time10 - time9}")
        qa = questions.merge(top_answers, on='Id')
        time11 = time.time()
        print(f"questions/answers merge time = {time11 - time10}")
        qa.rename(columns={'Id': 'question_id'}, inplace=True)
        time12 = time.time()
        print(f"qa rename time = {time12 - time11}")

        # Drop duplicate questions, if any.
        print('Duplicate entries: {}'.format(qa.duplicated(subset=['text']).sum()))
        qa.drop_duplicates(subset=['text'], inplace=True)
        time13 = time.time()
        print(f"qa drop duplicates time = {time13 - time12}")
        # Convert the HTML question and answer bodies to plain text.
        print('convert html to plain text...')
        qa['question-body'] = qa['Question-html'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
        qa['answer'] = qa['TopAnswer-html'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
        qa.drop(columns=['Question-html', 'TopAnswer-html'], inplace=True)
        time14 = time.time()
        print(f"convert html time = {time14 - time13}")

        # Materialize as a ray Dataset and map each batch of rows to Haystack Documents.
        dataset = ray.data.from_modin(qa)
        dataset = dataset.map_batches(_generate_documents)
        return dataset
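

# Example usage -- a minimal sketch, not part of the original pipeline. The CSV
# paths are placeholders; the columns handled above match the Kaggle
# "StackSample" Questions.csv / Answers.csv layout.
if __name__ == "__main__":
    ray.init()  # assumes a local or already-configured ray cluster
    converter = StackoverflowDataset(
        question_file="Questions.csv",  # placeholder path
        answer_file="Answers.csv",      # placeholder path
        batch_size=4096,
    )
    dataset = converter.convert()
    print(f"converted {dataset.count()} question/answer pairs to Documents")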