Skip to content

Commit 2e143c6

Browse files
deploy: 5777bc5
0 parents  commit 2e143c6

38 files changed

Lines changed: 17891 additions & 0 deletions

.nojekyll

Whitespace-only changes.

CHANGELOG-commonmark.md

Lines changed: 408 additions & 0 deletions
Large diffs are not rendered by default.

CHANGELOG.html

Lines changed: 1253 additions & 0 deletions
Large diffs are not rendered by default.

download.html

Lines changed: 824 additions & 0 deletions
Large diffs are not rendered by default.

download.html.md

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
# Download helpers
2+
3+
4+
<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
5+
6+
``` python
7+
from IPython.display import Markdown,HTML
8+
from fastcore.test import *
9+
```
10+
11+
------------------------------------------------------------------------
12+
13+
<a
14+
href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L14"
15+
target="_blank" style="float:right; font-size:smaller">source</a>
16+
17+
### clean_md
18+
19+
``` python
20+
21+
def clean_md(
22+
text, rm_comments:bool=True, rm_details:bool=True
23+
):
24+
25+
```
26+
27+
*Remove comments and `<details>` sections from `text`*
28+
29+
------------------------------------------------------------------------
30+
31+
<a
32+
href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L22"
33+
target="_blank" style="float:right; font-size:smaller">source</a>
34+
35+
### read_md
36+
37+
``` python
38+
39+
def read_md(
40+
url, rm_comments:bool=True, rm_details:bool=True, params:QueryParamTypes | None=None,
41+
headers:HeaderTypes | None=None, cookies:CookieTypes | None=None, auth:AuthTypes | None=None,
42+
proxy:ProxyTypes | None=None, follow_redirects:bool=False, verify:ssl.SSLContext | str | bool=True,
43+
timeout:TimeoutTypes=Timeout(timeout=5.0), trust_env:bool=True
44+
):
45+
46+
```
47+
48+
*Read text from `url` and clean with `clean_md`*
49+
50+
``` python
51+
mdurl = 'https://claudette.answer.ai/index.html.md'
52+
md = read_md(mdurl)
53+
# Markdown(md)
54+
```
55+
56+
------------------------------------------------------------------------
57+
58+
<a
59+
href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L27"
60+
target="_blank" style="float:right; font-size:smaller">source</a>
61+
62+
### html2md
63+
64+
``` python
65+
66+
def html2md(
67+
s:str, ignore_links:bool=True
68+
):
69+
70+
```
71+
72+
*Convert `s` from HTML to markdown*
73+
74+
------------------------------------------------------------------------
75+
76+
<a
77+
href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L37"
78+
target="_blank" style="float:right; font-size:smaller">source</a>
79+
80+
### read_html
81+
82+
``` python
83+
84+
def read_html(
85+
url, # URL to read
86+
sel:NoneType=None, # Read only outerHTML of CSS selector `sel`
87+
rm_comments:bool=True, # Removes HTML comments
88+
rm_details:bool=True, # Removes `<details>` tags
89+
multi:bool=False, # Get all matches to `sel` or first one
90+
wrap_tag:NoneType=None, # If multi, each selection wrapped with <wrap_tag>content</wrap_tag>
91+
ignore_links:bool=True
92+
):
93+
94+
```
95+
96+
*Get `url`, optionally selecting CSS selector `sel`, and convert to
97+
clean markdown*
98+
99+
``` python
100+
# test single class selector
101+
listings = read_html('https://www.answer.ai/', sel='.listing-description')
102+
assert len(listings) < 500
103+
104+
# Test multi class selector
105+
listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True)
106+
assert len(listings) > 1000 # longer than the single-match result, so multiple matches were selected
107+
108+
# Test multi_wrap_tag
109+
listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True, wrap_tag='document')
110+
assert '<document>' in listings and '</document>' in listings
111+
```
112+
113+
``` python
114+
read_html('https://www.answer.ai/', sel='.listing-description', ignore_links=False)
115+
```
116+
117+
'[ How I created a book chapter from video transcripts with SolveIt ](./posts/2025-10-13-video-to-doc.html)\n\n'
118+
119+
``` python
120+
# test tag css selectors
121+
assert len(read_html('https://www.answer.ai/', sel='div.listing-description', multi=True)) > 1000
122+
assert len(read_html('https://www.answer.ai/', sel='div', multi=True)) > 1000
123+
```
124+
125+
``` python
126+
htmlurl = 'https://hypermedia.systems/hypermedia-a-reintroduction/'
127+
hmd = read_html(htmlurl)
128+
assert len(hmd) > 100
129+
# Markdown(hmd)
130+
```
131+
132+
------------------------------------------------------------------------
133+
134+
<a
135+
href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L59"
136+
target="_blank" style="float:right; font-size:smaller">source</a>
137+
138+
### get_llmstxt
139+
140+
``` python
141+
142+
def get_llmstxt(
143+
url, optional:bool=False, n_workers:NoneType=None
144+
):
145+
146+
```
147+
148+
*Get llms.txt file from `url` and expand it with `llms_txt.create_ctx()`*
149+
150+
``` python
151+
# print(get_llmstxt('https://llmstxt.org/llms.txt'))
152+
```
153+
154+
------------------------------------------------------------------------
155+
156+
<a
157+
href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L68"
158+
target="_blank" style="float:right; font-size:smaller">source</a>
159+
160+
### split_url
161+
162+
``` python
163+
164+
def split_url(
165+
url
166+
):
167+
168+
```
169+
170+
*Split `url` into base, path, and file name, normalising name to ‘/’ if
171+
empty*
172+
173+
``` python
174+
urls = ('https://claudette.answer.ai/path/', 'https://claudette.answer.ai/', 'https://llmstxt.org', 'https://llmstxt.org/')
175+
176+
[split_url(o) for o in urls]
177+
```
178+
179+
[('https://claudette.answer.ai', '', '/path'),
180+
('https://claudette.answer.ai', '/', ''),
181+
('https://llmstxt.org', '/', ''),
182+
('https://llmstxt.org', '/', '')]
183+
184+
------------------------------------------------------------------------
185+
186+
<a
187+
href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L84"
188+
target="_blank" style="float:right; font-size:smaller">source</a>
189+
190+
### find_docs
191+
192+
``` python
193+
194+
def find_docs(
195+
url
196+
):
197+
198+
```
199+
200+
*If available, return LLM-friendly llms.txt context or markdown file
201+
location from `url`*
202+
203+
``` python
204+
fl_url = 'https://answerdotai.github.io/fastlite'
205+
```
206+
207+
``` python
208+
find_docs(fl_url)
209+
```
210+
211+
'https://answerdotai.github.io/fastlite/llms.txt'
212+
213+
``` python
214+
for o in urls: print(find_docs(o))
215+
```
216+
217+
https://claudette.answer.ai/llms.txt
218+
https://claudette.answer.ai/llms.txt
219+
https://llmstxt.org/llms.txt
220+
https://llmstxt.org/llms.txt
221+
222+
``` python
223+
suffixes = ["/", "/tmp", "/tmp/tmp/"]
224+
for suff in suffixes:
225+
for o in urls: test_eq(find_docs(o), find_docs(o+suff))
226+
227+
test_eq(find_docs("https://github.com"), "https://github.com/llms.txt")
228+
test_eq(find_docs("https://github.com/AnswerDotAI"), "https://github.com/llms.txt")
229+
test_eq(find_docs("https://github.com/AnswerDotAI/"), "https://github.com/llms.txt")
230+
```
231+
232+
------------------------------------------------------------------------
233+
234+
<a
235+
href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L104"
236+
target="_blank" style="float:right; font-size:smaller">source</a>
237+
238+
### read_docs
239+
240+
``` python
241+
242+
def read_docs(
243+
url, optional:bool=False, n_workers:NoneType=None, rm_comments:bool=True, rm_details:bool=True
244+
):
245+
246+
```
247+
248+
*If available, return LLM-friendly llms.txt context or markdown file
249+
response for `url`*

0 commit comments

Comments
 (0)