-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
30 lines (25 loc) · 831 Bytes
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
"""This module contains utility functions for the project"""
import mmh3
from haystack import Document
def get_unique_docs(dataset, unique_docs: set):
    """Build haystack Documents for the contexts that have not been seen yet.

    Args:
        dataset: iterable of dict-like records. Each record is read through
            the keys "context", "context_id", "context_title" and "url".
        unique_docs: set of context ids that were already converted. It is
            updated in place with the id of every document returned, so the
            same set can be shared across several calls to de-duplicate
            across datasets.

    Returns:
        docs: list of haystack.Document, one per record whose "context" is
            not None and whose "context_id" was not yet in ``unique_docs``,
            in dataset order.

    Raises:
        KeyError: for plain-dict records, when one of the keys read for a
            record is missing. The id of the failing record is not added to
            ``unique_docs``.
    """
    docs = []
    for doc in dataset:
        # Records without a context carry nothing to index.
        if doc["context"] is None:
            continue

        context_id = doc["context_id"]
        if context_id in unique_docs:
            continue

        # Read every field and build the Document *before* recording the id:
        # if anything here raises, the caller-owned set must not claim that
        # this context was already converted.
        meta = {
            "title": doc["context_title"],
            "context_id": context_id,
            "url": doc["url"],
            "source": "QASports",
        }
        document = Document(content=doc["context"], meta=meta)

        unique_docs.add(context_id)
        docs.append(document)
    return docs