-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathloader.py
More file actions
75 lines (62 loc) · 2.81 KB
/
loader.py
File metadata and controls
75 lines (62 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter
import os
import logging
# Loader diagnostics go to logs/loader.log. FileHandler opens its file as soon
# as it is constructed, so the target directory has to exist beforehand;
# otherwise merely importing this module raises FileNotFoundError.
os.makedirs("logs", exist_ok=True)
handler = logging.FileHandler("logs/loader.log")
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
handler.setFormatter(formatter)
logger = logging.getLogger('loader')
logger.setLevel(logging.INFO)
logger.addHandler(handler)
def find_markdown_files(directory, nso_ver=None):
    """Walk *directory* and map documentation URLs to Markdown file paths.

    A file is included when its name ends with ".md", its path has more than
    three "/"-separated components, and the substring "README" does not occur
    anywhere in its path.

    Args:
        directory: Root directory to search recursively.
        nso_ver: Optional version string. When it is truthy and not "latest",
            it is inserted as a path segment right after ".../guides/".

    Returns:
        A dict whose keys are URLs under https://nso-docs.cisco.com/guides/
        (without the ".md" extension) and whose values are the matching file
        paths as produced by ``os.path.join(root, file)``.
    """
    repo_marker = "resource/nso-gitbook/"
    extension = ".md"

    base_url = "https://nso-docs.cisco.com/guides/"
    if nso_ver and nso_ver != "latest":
        base_url += nso_ver + "/"

    logger.info("Directory Path: " + os.path.abspath(directory))
    markdown_files = {}
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith(extension):
                continue
            markdown_path = os.path.join(root, file)
            # Compare and split on "/" regardless of the OS path separator.
            posix_path = markdown_path.replace(os.sep, "/")
            path_level = posix_path.split("/")
            # Skip shallow files and anything with README in its path.
            if len(path_level) <= 3 or "README" in posix_path:
                continue
            # The URL path is whatever follows the gitbook checkout prefix.
            # If the walk was started somewhere else, fall back to the path
            # relative to *directory* instead of failing with IndexError.
            _, marker, relative = posix_path.partition(repo_marker)
            if not marker:
                relative = os.path.relpath(markdown_path, directory)
                relative = relative.replace(os.sep, "/")
            # Drop only the trailing extension; a ".md" occurring elsewhere
            # in the path is part of the URL and must be preserved.
            url = base_url + relative[:-len(extension)]
            logger.info(f"Found markdown file: {markdown_path} at {path_level[2]} level {len(path_level)} / url: {url}")
            markdown_files[url] = markdown_path
    return markdown_files
def article_loader(markdown_path):
    """Return the full text of the file at *markdown_path*.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Args:
        markdown_path: Path of the file to read.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    with open(markdown_path, encoding="utf-8") as f:
        return f.read()
def markdown_splitter(markdown_document, url, markdown_path, nso_ver):
    """Split Markdown text on "#", "##" and "###" headers and tag each chunk.

    Any metadata value of a chunk that contains "<a href" is cut off at that
    marker and stripped of surrounding whitespace. Every chunk then receives
    three extra metadata entries: "url", "path" and "NSO Version".

    Args:
        markdown_document: The Markdown text to split.
        url: Value stored under the "url" metadata key.
        markdown_path: Value stored under the "path" metadata key.
        nso_ver: Value stored under the "NSO Version" metadata key.

    Returns:
        The list of chunks produced by MarkdownHeaderTextSplitter.split_text,
        with their metadata updated in place.
    """
    anchor_marker = "<a href"
    header_splitter = MarkdownHeaderTextSplitter(
        [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
    )
    chunks = header_splitter.split_text(markdown_document)
    for chunk in chunks:
        meta = chunk.metadata
        # Iterate over a snapshot so values can be rewritten during the loop.
        for name, value in list(meta.items()):
            if anchor_marker in value:
                meta[name] = value.split(anchor_marker)[0].strip()
        meta.update({"url": url, "path": markdown_path, "NSO Version": nso_ver})
    return chunks
if __name__ == "__main__":
    # Manual smoke run: discover the Markdown files under the gitbook
    # checkout, split each one, and print how many chunks it produced.
    docs_root = "resource/nso-gitbook/"
    # No specific version selected; find_markdown_files then builds
    # unversioned URLs, and the same value is recorded in chunk metadata.
    nso_ver = None
    markdown_files = find_markdown_files(docs_root, nso_ver)
    for url, markdown_path in markdown_files.items():
        print(f"Processing markdown file: {markdown_path}")
        markdown_document = article_loader(markdown_path)
        # markdown_splitter requires all four arguments; passing only the
        # document and URL raises TypeError.
        chunks = markdown_splitter(markdown_document, url, markdown_path, nso_ver)
        print(f"Total document chunks: {len(chunks)}")