Skip to content

Commit c757d5d

Browse files
committed
feat: cache-busting plot images
1 parent 927e760 commit c757d5d

2 files changed

Lines changed: 38 additions & 36 deletions

File tree

docs/index.html

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -103863,8 +103863,8 @@ <h2 id="Statistics-Plots">Statistics Plots</h2>
103863103863
<h4 id="nodes">
103864103864
<a href="#nodes">nodes</a>
103865103865
</h4>
103866-
<a href="./plots/nodes.png" target="_blank">
103867-
<img src="./plots/nodes.png" alt="nodes Plot">
103866+
<a href="./plots/nodes.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103867+
<img src="./plots/nodes.png?version=cc-main-2025-feb-mar-apr" alt="nodes Plot">
103868103868
</a>
103869103869
<p>The total number of unique nodes (e.g., domains, hosts, or pages) in the graph. Each node represents an entity in the web graph.</p>
103870103870
</div>
@@ -103873,8 +103873,8 @@ <h4 id="nodes">
103873103873
<h4 id="arcs">
103874103874
<a href="#arcs">arcs</a>
103875103875
</h4>
103876-
<a href="./plots/arcs.png" target="_blank">
103877-
<img src="./plots/arcs.png" alt="arcs Plot">
103876+
<a href="./plots/arcs.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103877+
<img src="./plots/arcs.png?version=cc-main-2025-feb-mar-apr" alt="arcs Plot">
103878103878
</a>
103879103879
<p>The total number of directed edges (or arcs) in the graph, representing links between nodes.</p>
103880103880
</div>
@@ -103883,8 +103883,8 @@ <h4 id="arcs">
103883103883
<h4 id="successoravggap">
103884103884
<a href="#successoravggap">successoravggap</a>
103885103885
</h4>
103886-
<a href="./plots/successoravggap.png" target="_blank">
103887-
<img src="./plots/successoravggap.png" alt="successoravggap Plot">
103886+
<a href="./plots/successoravggap.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103887+
<img src="./plots/successoravggap.png?version=cc-main-2025-feb-mar-apr" alt="successoravggap Plot">
103888103888
</a>
103889103889
<p>The average gap between successive nodes in the adjacency list of the graph. This reflects the ordering and clustering of nodes in the graph.</p>
103890103890
</div>
@@ -103893,8 +103893,8 @@ <h4 id="successoravggap">
103893103893
<h4 id="avglocality">
103894103894
<a href="#avglocality">avglocality</a>
103895103895
</h4>
103896-
<a href="./plots/avglocality.png" target="_blank">
103897-
<img src="./plots/avglocality.png" alt="avglocality Plot">
103896+
<a href="./plots/avglocality.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103897+
<img src="./plots/avglocality.png?version=cc-main-2025-feb-mar-apr" alt="avglocality Plot">
103898103898
</a>
103899103899
<p>A measure of the locality of edges in the graph, indicating how closely related the linked nodes are in terms of graph structure.</p>
103900103900
</div>
@@ -103903,8 +103903,8 @@ <h4 id="avglocality">
103903103903
<h4 id="maxoutdegree">
103904103904
<a href="#maxoutdegree">maxoutdegree</a>
103905103905
</h4>
103906-
<a href="./plots/maxoutdegree.png" target="_blank">
103907-
<img src="./plots/maxoutdegree.png" alt="maxoutdegree Plot">
103906+
<a href="./plots/maxoutdegree.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103907+
<img src="./plots/maxoutdegree.png?version=cc-main-2025-feb-mar-apr" alt="maxoutdegree Plot">
103908103908
</a>
103909103909
<p>The highest number of outgoing edges (links) from a single node in the graph. This identifies the most connected node in terms of outlinks.</p>
103910103910
</div>
@@ -103913,8 +103913,8 @@ <h4 id="maxoutdegree">
103913103913
<h4 id="dangling">
103914103914
<a href="#dangling">dangling</a>
103915103915
</h4>
103916-
<a href="./plots/dangling.png" target="_blank">
103917-
<img src="./plots/dangling.png" alt="dangling Plot">
103916+
<a href="./plots/dangling.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103917+
<img src="./plots/dangling.png?version=cc-main-2025-feb-mar-apr" alt="dangling Plot">
103918103918
</a>
103919103919
<p>The total number of dangling nodes in the graph, which are nodes (vertices) with zero outgoing edges (arcs).</p>
103920103920
</div>
@@ -103923,8 +103923,8 @@ <h4 id="dangling">
103923103923
<h4 id="percdangling">
103924103924
<a href="#percdangling">percdangling</a>
103925103925
</h4>
103926-
<a href="./plots/percdangling.png" target="_blank">
103927-
<img src="./plots/percdangling.png" alt="percdangling Plot">
103926+
<a href="./plots/percdangling.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103927+
<img src="./plots/percdangling.png?version=cc-main-2025-feb-mar-apr" alt="percdangling Plot">
103928103928
</a>
103929103929
<p>The percentage of nodes in the graph that are dangling nodes (see above).</p>
103930103930
</div>
@@ -103933,8 +103933,8 @@ <h4 id="percdangling">
103933103933
<h4 id="avgoutdegree">
103934103934
<a href="#avgoutdegree">avgoutdegree</a>
103935103935
</h4>
103936-
<a href="./plots/avgoutdegree.png" target="_blank">
103937-
<img src="./plots/avgoutdegree.png" alt="avgoutdegree Plot">
103936+
<a href="./plots/avgoutdegree.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103937+
<img src="./plots/avgoutdegree.png?version=cc-main-2025-feb-mar-apr" alt="avgoutdegree Plot">
103938103938
</a>
103939103939
<p>The average number of outgoing edges per node. This provides an overview of the graph's overall connectivity.</p>
103940103940
</div>
@@ -103943,8 +103943,8 @@ <h4 id="avgoutdegree">
103943103943
<h4 id="successoravglogdelta">
103944103944
<a href="#successoravglogdelta">successoravglogdelta</a>
103945103945
</h4>
103946-
<a href="./plots/successoravglogdelta.png" target="_blank">
103947-
<img src="./plots/successoravglogdelta.png" alt="successoravglogdelta Plot">
103946+
<a href="./plots/successoravglogdelta.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103947+
<img src="./plots/successoravglogdelta.png?version=cc-main-2025-feb-mar-apr" alt="successoravglogdelta Plot">
103948103948
</a>
103949103949
<p>The average logarithmic difference between successive node IDs in the adjacency list. This reflects the dispersion of node IDs in the graph structure.</p>
103950103950
</div>
@@ -103953,8 +103953,8 @@ <h4 id="successoravglogdelta">
103953103953
<h4 id="maxindegree">
103954103954
<a href="#maxindegree">maxindegree</a>
103955103955
</h4>
103956-
<a href="./plots/maxindegree.png" target="_blank">
103957-
<img src="./plots/maxindegree.png" alt="maxindegree Plot">
103956+
<a href="./plots/maxindegree.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103957+
<img src="./plots/maxindegree.png?version=cc-main-2025-feb-mar-apr" alt="maxindegree Plot">
103958103958
</a>
103959103959
<p>The highest number of incoming edges (links) to a single node in the graph. It identifies the most referenced or linked-to node.</p>
103960103960
</div>
@@ -103963,8 +103963,8 @@ <h4 id="maxindegree">
103963103963
<h4 id="avgindegree">
103964103964
<a href="#avgindegree">avgindegree</a>
103965103965
</h4>
103966-
<a href="./plots/avgindegree.png" target="_blank">
103967-
<img src="./plots/avgindegree.png" alt="avgindegree Plot">
103966+
<a href="./plots/avgindegree.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103967+
<img src="./plots/avgindegree.png?version=cc-main-2025-feb-mar-apr" alt="avgindegree Plot">
103968103968
</a>
103969103969
<p>The average number of incoming edges per node. This always equals avgoutdegree, because every arc contributes exactly one outgoing and one incoming edge.</p>
103970103970
</div>
@@ -103973,8 +103973,8 @@ <h4 id="avgindegree">
103973103973
<h4 id="sccs">
103974103974
<a href="#sccs">sccs</a>
103975103975
</h4>
103976-
<a href="./plots/sccs.png" target="_blank">
103977-
<img src="./plots/sccs.png" alt="sccs Plot">
103976+
<a href="./plots/sccs.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103977+
<img src="./plots/sccs.png?version=cc-main-2025-feb-mar-apr" alt="sccs Plot">
103978103978
</a>
103979103979
<p>The total number of strongly connected components (SCCs) in the graph. SCCs are subgraphs in which every node is reachable from every other node within the subgraph.</p>
103980103980
</div>
@@ -103983,8 +103983,8 @@ <h4 id="sccs">
103983103983
<h4 id="maxsccsize">
103984103984
<a href="#maxsccsize">maxsccsize</a>
103985103985
</h4>
103986-
<a href="./plots/maxsccsize.png" target="_blank">
103987-
<img src="./plots/maxsccsize.png" alt="maxsccsize Plot">
103986+
<a href="./plots/maxsccsize.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103987+
<img src="./plots/maxsccsize.png?version=cc-main-2025-feb-mar-apr" alt="maxsccsize Plot">
103988103988
</a>
103989103989
<p>The size (number of nodes) of the largest strongly connected component (SCC) in the graph. This indicates the largest cluster of nodes that are mutually reachable.</p>
103990103990
</div>
@@ -103993,8 +103993,8 @@ <h4 id="maxsccsize">
103993103993
<h4 id="percmaxscc">
103994103994
<a href="#percmaxscc">percmaxscc</a>
103995103995
</h4>
103996-
<a href="./plots/percmaxscc.png" target="_blank">
103997-
<img src="./plots/percmaxscc.png" alt="percmaxscc Plot">
103996+
<a href="./plots/percmaxscc.png?version=cc-main-2025-feb-mar-apr" target="_blank">
103997+
<img src="./plots/percmaxscc.png?version=cc-main-2025-feb-mar-apr" alt="percmaxscc Plot">
103998103998
</a>
103999103999
<p>The percentage of nodes in the graph that belong to the largest strongly connected component (SCC). It shows how dominant the largest SCC is in the overall graph.</p>
104000104000
</div>
@@ -104003,8 +104003,8 @@ <h4 id="percmaxscc">
104003104003
<h4 id="percminscc">
104004104004
<a href="#percminscc">percminscc</a>
104005104005
</h4>
104006-
<a href="./plots/percminscc.png" target="_blank">
104007-
<img src="./plots/percminscc.png" alt="percminscc Plot">
104006+
<a href="./plots/percminscc.png?version=cc-main-2025-feb-mar-apr" target="_blank">
104007+
<img src="./plots/percminscc.png?version=cc-main-2025-feb-mar-apr" alt="percminscc Plot">
104008104008
</a>
104009104009
<p>The percentage of nodes in the graph that belong to the smallest strongly connected components (SCCs) (typically isolated nodes or trivial SCCs). This indicates the prevalence of disconnected or minimally connected components.</p>
104010104010
</div>

src/build_webpage.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def has_zero_signal(series):
4848
return series.nunique() <= 1
4949

5050

51-
def generate_plots(data):
51+
def generate_plots(data, latest_release):
5252
excluded = [
5353
'maxindegreenode',
5454
'maxoutdegreenode',
@@ -66,9 +66,10 @@ def generate_plots(data):
6666
if col not in ["release", "source"] and not has_comma_separated_values(data[col]) and not has_zero_signal(data[col]) and col not in excluded:
6767
file_name = f"{col}.png"
6868
file_path = f"../docs/plots/{file_name}"
69+
html_file_name = f"{file_name}?version={latest_release}"
6970
if args.no_plots and os.path.exists(file_path):
7071
progress_bar.set_description(f"Using existing plot: {file_name}")
71-
plot_files.append((col, file_name))
72+
plot_files.append((col, html_file_name))
7273
else:
7374
progress_bar.set_description(f"Generating plot: {file_name}")
7475
plot = (
@@ -138,12 +139,12 @@ def embed_file(file_path):
138139
categories=combined_data['release'].unique()
139140
)
140141

141-
plot_files = generate_plots(combined_data)
142-
143142
last_updated = datetime.datetime.now().strftime("%Y-%m-%d")
144143
latest_release = combined_data['release'].iloc[-1]
145144
latest_release_url = f"https://data.commoncrawl.org/projects/hyperlinkgraph/{latest_release}/index.html"
146145

146+
plot_files = generate_plots(combined_data, latest_release)
147+
147148
html_content = """
148149
<!DOCTYPE html>
149150
<html lang="en">
@@ -322,15 +323,16 @@ def embed_file(file_path):
322323
for col in descriptions.keys():
323324
file_name = f"{col}.png"
324325
file_path = f"../docs/plots/{file_name}"
326+
html_file_name = f"{file_name}?version={latest_release}"
325327
description = descriptions.get(col, "No description available.")
326328
if os.path.exists(file_path):
327329
html_content += f"""
328330
<div class="chart-container">
329331
<h4 id="{col}">
330332
<a href="#{col}">{col}</a>
331333
</h4>
332-
<a href="./plots/{file_name}" target="_blank">
333-
<img src="./plots/{file_name}" alt="{col} Plot">
334+
<a href="./plots/{html_file_name}" target="_blank">
335+
<img src="./plots/{html_file_name}" alt="{col} Plot">
334336
</a>
335337
<p>{description}</p>
336338
</div>

0 commit comments

Comments
 (0)