|
| 1 | +# Download helpers |
| 2 | + |
| 3 | + |
| 4 | +<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! --> |
| 5 | + |
| 6 | +``` python |
| 7 | +from IPython.display import Markdown,HTML |
| 8 | +from fastcore.test import * |
| 9 | +``` |
| 10 | + |
| 11 | +------------------------------------------------------------------------ |
| 12 | + |
| 13 | +<a |
| 14 | +href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L14" |
| 15 | +target="_blank" style="float:right; font-size:smaller">source</a> |
| 16 | + |
| 17 | +### clean_md |
| 18 | + |
| 19 | +``` python |
| 20 | + |
| 21 | +def clean_md( |
| 22 | + text, rm_comments:bool=True, rm_details:bool=True |
| 23 | +): |
| 24 | + |
| 25 | +``` |
| 26 | + |
| 27 | +*Remove comments and `<details>` sections from `text`* |
| 28 | + |
| 29 | +------------------------------------------------------------------------ |
| 30 | + |
| 31 | +<a |
| 32 | +href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L22" |
| 33 | +target="_blank" style="float:right; font-size:smaller">source</a> |
| 34 | + |
| 35 | +### read_md |
| 36 | + |
| 37 | +``` python |
| 38 | + |
| 39 | +def read_md( |
| 40 | + url, rm_comments:bool=True, rm_details:bool=True, params:QueryParamTypes | None=None, |
| 41 | + headers:HeaderTypes | None=None, cookies:CookieTypes | None=None, auth:AuthTypes | None=None, |
| 42 | + proxy:ProxyTypes | None=None, follow_redirects:bool=False, verify:ssl.SSLContext | str | bool=True, |
| 43 | + timeout:TimeoutTypes=Timeout(timeout=5.0), trust_env:bool=True |
| 44 | +): |
| 45 | + |
| 46 | +``` |
| 47 | + |
| 48 | +*Read text from `url` and clean with `clean_md`* |
| 49 | + |
| 50 | +``` python |
| 51 | +mdurl = 'https://claudette.answer.ai/index.html.md' |
| 52 | +md = read_md(mdurl) |
| 53 | +# Markdown(md) |
| 54 | +``` |
| 55 | + |
| 56 | +------------------------------------------------------------------------ |
| 57 | + |
| 58 | +<a |
| 59 | +href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L27" |
| 60 | +target="_blank" style="float:right; font-size:smaller">source</a> |
| 61 | + |
| 62 | +### html2md |
| 63 | + |
| 64 | +``` python |
| 65 | + |
| 66 | +def html2md( |
| 67 | + s:str, ignore_links:bool=True |
| 68 | +): |
| 69 | + |
| 70 | +``` |
| 71 | + |
| 72 | +*Convert `s` from HTML to markdown* |
| 73 | + |
| 74 | +------------------------------------------------------------------------ |
| 75 | + |
| 76 | +<a |
| 77 | +href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L37" |
| 78 | +target="_blank" style="float:right; font-size:smaller">source</a> |
| 79 | + |
| 80 | +### read_html |
| 81 | + |
| 82 | +``` python |
| 83 | + |
| 84 | +def read_html( |
| 85 | + url, # URL to read |
| 86 | + sel:NoneType=None, # Read only outerHTML of CSS selector `sel` |
| 87 | + rm_comments:bool=True, # Removes HTML comments |
| 88 | + rm_details:bool=True, # Removes `<details>` tags |
| 89 | + multi:bool=False, # Get all matches to `sel` or first one |
| 90 | + wrap_tag:NoneType=None, # If multi, each selection wrapped with <wrap_tag>content</wrap_tag> |
| 91 | + ignore_links:bool=True |
| 92 | +): |
| 93 | + |
| 94 | +``` |
| 95 | + |
| 96 | +*Get `url`, optionally selecting CSS selector `sel`, and convert to |
| 97 | +clean markdown* |
| 98 | + |
| 99 | +``` python |
| 100 | +# test single class selector |
| 101 | +listings = read_html('https://www.answer.ai/', sel='.listing-description') |
| 102 | +assert len(listings) < 500 |
| 103 | + |
| 104 | +# Test multi class selector |
| 105 | +listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True) |
| 106 | +assert len(listings) > 1000 # returns more than single so selecting multi |
| 107 | + |
| 108 | +# Test multi_wrap_tag |
| 109 | +listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True, wrap_tag='document') |
| 110 | +assert '<document>' in listings and '</document>' in listings |
| 111 | +``` |
| 112 | + |
| 113 | +``` python |
| 114 | +read_html('https://www.answer.ai/', sel='.listing-description', ignore_links=False) |
| 115 | +``` |
| 116 | + |
| 117 | + '[ How I created a book chapter from video transcripts with SolveIt ](./posts/2025-10-13-video-to-doc.html)\n\n' |
| 118 | + |
| 119 | +``` python |
| 120 | +# test tag css selectors |
| 121 | +assert len(read_html('https://www.answer.ai/', sel='div.listing-description', multi=True)) > 1000 |
| 122 | +assert len(read_html('https://www.answer.ai/', sel='div', multi=True)) > 1000 |
| 123 | +``` |
| 124 | + |
| 125 | +``` python |
| 126 | +htmlurl = 'https://hypermedia.systems/hypermedia-a-reintroduction/' |
| 127 | +hmd = read_html(htmlurl) |
| 128 | +assert len(hmd) > 100 |
| 129 | +# Markdown(hmd) |
| 130 | +``` |
| 131 | + |
| 132 | +------------------------------------------------------------------------ |
| 133 | + |
| 134 | +<a |
| 135 | +href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L59" |
| 136 | +target="_blank" style="float:right; font-size:smaller">source</a> |
| 137 | + |
| 138 | +### get_llmstxt |
| 139 | + |
| 140 | +``` python |
| 141 | + |
| 142 | +def get_llmstxt( |
| 143 | + url, optional:bool=False, n_workers:NoneType=None |
| 144 | +): |
| 145 | + |
| 146 | +``` |
| 147 | + |
| 148 | +*Get llms.txt file from `url` and expand it with `llms_txt.create_ctx()`* |
| 149 | + |
| 150 | +``` python |
| 151 | +# print(get_llmstxt('https://llmstxt.org/llms.txt')) |
| 152 | +``` |
| 153 | + |
| 154 | +------------------------------------------------------------------------ |
| 155 | + |
| 156 | +<a |
| 157 | +href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L68" |
| 158 | +target="_blank" style="float:right; font-size:smaller">source</a> |
| 159 | + |
| 160 | +### split_url |
| 161 | + |
| 162 | +``` python |
| 163 | + |
| 164 | +def split_url( |
| 165 | + url |
| 166 | +): |
| 167 | + |
| 168 | +``` |
| 169 | + |
| 170 | +*Split `url` into base, path, and file name, normalising path to ‘/’ if |
| 171 | +both path and file name are empty* |
| 172 | + |
| 173 | +``` python |
| 174 | +urls = ('https://claudette.answer.ai/path/', 'https://claudette.answer.ai/', 'https://llmstxt.org', 'https://llmstxt.org/') |
| 175 | + |
| 176 | +[split_url(o) for o in urls] |
| 177 | +``` |
| 178 | + |
| 179 | + [('https://claudette.answer.ai', '', '/path'), |
| 180 | + ('https://claudette.answer.ai', '/', ''), |
| 181 | + ('https://llmstxt.org', '/', ''), |
| 182 | + ('https://llmstxt.org', '/', '')] |
| 183 | + |
| 184 | +------------------------------------------------------------------------ |
| 185 | + |
| 186 | +<a |
| 187 | +href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L84" |
| 188 | +target="_blank" style="float:right; font-size:smaller">source</a> |
| 189 | + |
| 190 | +### find_docs |
| 191 | + |
| 192 | +``` python |
| 193 | + |
| 194 | +def find_docs( |
| 195 | + url |
| 196 | +): |
| 197 | + |
| 198 | +``` |
| 199 | + |
| 200 | +*If available, return LLM-friendly llms.txt context or markdown file |
| 201 | +location from `url`* |
| 202 | + |
| 203 | +``` python |
| 204 | +fl_url = 'https://answerdotai.github.io/fastlite' |
| 205 | +``` |
| 206 | + |
| 207 | +``` python |
| 208 | +find_docs(fl_url) |
| 209 | +``` |
| 210 | + |
| 211 | + 'https://answerdotai.github.io/fastlite/llms.txt' |
| 212 | + |
| 213 | +``` python |
| 214 | +for o in urls: print(find_docs(o)) |
| 215 | +``` |
| 216 | + |
| 217 | + https://claudette.answer.ai/llms.txt |
| 218 | + https://claudette.answer.ai/llms.txt |
| 219 | + https://llmstxt.org/llms.txt |
| 220 | + https://llmstxt.org/llms.txt |
| 221 | + |
| 222 | +``` python |
| 223 | +suffixes = ["/", "/tmp", "/tmp/tmp/"] |
| 224 | +for suff in suffixes: |
| 225 | + for o in urls: test_eq(find_docs(o), find_docs(o+suff)) |
| 226 | + |
| 227 | +test_eq(find_docs("https://github.com"), "https://github.com/llms.txt") |
| 228 | +test_eq(find_docs("https://github.com/AnswerDotAI"), "https://github.com/llms.txt") |
| 229 | +test_eq(find_docs("https://github.com/AnswerDotAI/"), "https://github.com/llms.txt") |
| 230 | +``` |
| 231 | + |
| 232 | +------------------------------------------------------------------------ |
| 233 | + |
| 234 | +<a |
| 235 | +href="https://github.com/AnswerDotAI/toolslm/blob/main/toolslm/download.py#L104" |
| 236 | +target="_blank" style="float:right; font-size:smaller">source</a> |
| 237 | + |
| 238 | +### read_docs |
| 239 | + |
| 240 | +``` python |
| 241 | + |
| 242 | +def read_docs( |
| 243 | + url, optional:bool=False, n_workers:NoneType=None, rm_comments:bool=True, rm_details:bool=True |
| 244 | +): |
| 245 | + |
| 246 | +``` |
| 247 | + |
| 248 | +*If available, return LLM-friendly llms.txt context or markdown file |
| 249 | +response for `url`* |
0 commit comments