-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload_data.py
27 lines (20 loc) · 977 Bytes
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.html import UnstructuredHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
def initialize_splitter(chunk_size, chunk_overlap):
    """Create a ``RecursiveCharacterTextSplitter`` with the requested chunking.

    Args:
        chunk_size: Forwarded unchanged as the splitter's ``chunk_size``.
        chunk_overlap: Forwarded unchanged as the splitter's
            ``chunk_overlap``.

    Returns:
        The newly constructed ``RecursiveCharacterTextSplitter``.
    """
    # Lengths are measured with the built-in ``len``, and the splitter is
    # told its separators are not regular expressions.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "is_separator_regex": False,
    }
    return RecursiveCharacterTextSplitter(**splitter_options)
def load_split_html_file(html_file, text_splitter):
    """Load an HTML file and split its contents with ``text_splitter``.

    Args:
        html_file: Passed directly to ``UnstructuredHTMLLoader``
            (presumably a path to the HTML file -- confirm against callers).
        text_splitter: Splitter handed to the loader's ``load_and_split``.

    Returns:
        Whatever ``UnstructuredHTMLLoader.load_and_split`` returns for the
        given splitter (presumably the split document chunks -- verify
        against the installed LangChain version).
    """
    # Loading and splitting are delegated to the loader in a single call.
    return UnstructuredHTMLLoader(html_file).load_and_split(text_splitter)
def load_split_pdf_file(pdf_file, text_splitter):
    """Load a PDF file and split its contents with ``text_splitter``.

    Args:
        pdf_file: Passed directly to ``PyPDFLoader`` (presumably a path to
            the PDF file -- confirm against callers).
        text_splitter: Splitter handed to the loader's ``load_and_split``.

    Returns:
        Whatever ``PyPDFLoader.load_and_split`` returns for the given
        splitter (presumably the split document chunks -- verify against
        the installed LangChain version).
    """
    # Loading and splitting are delegated to the loader in a single call.
    return PyPDFLoader(pdf_file).load_and_split(text_splitter)