Skip to content

Commit c96826d

Browse files
committed
fix(messages): resolve missing new messages bug and cap batch density to prevent OOM
1 parent c8ad8f3 commit c96826d

1 file changed

Lines changed: 15 additions & 11 deletions

File tree

ofscraper/data/api/messages.py

Lines changed: 15 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -192,7 +192,7 @@ def get_tasks(splitArrays, anchor_id, c, model_id, username, after):
192192
tasks = []
193193

194194
# Scenario 1: Empty DB, or just hunting for brand new messages
195-
if len(splitArrays) == 0:
195+
if not splitArrays:
196196
tasks.append(
197197
scrape_messages(
198198
c,
@@ -207,20 +207,24 @@ def get_tasks(splitArrays, anchor_id, c, model_id, username, after):
207207

208208
# Scenarios 2 & 3: Dynamic Chunking
209209
for i, chunk in enumerate(splitArrays):
210-
is_first_chunk = i == 0
211-
is_final_chunk = i == len(splitArrays) - 1
212-
213-
# The first chunk uses the teleport anchor.
214-
# Subsequent chunks use the ID of the OLDEST message in the previous chunk.
215-
start_id = (
216-
anchor_id if is_first_chunk else splitArrays[i - 1][-1].get("post_id")
217-
)
210+
211+
# --- 1. Define the Start ID ---
212+
if i == 0:
213+
start_id = None # Chunk 0: Teleport to the absolute newest message
214+
elif i == 1:
215+
start_id = anchor_id # Chunk 1: Pick up at the cached anchor
216+
else:
217+
# Chunk 2+: Pick up at the tail of the previous chunk
218+
start_id = splitArrays[i - 1][-1].get("post_id")
219+
220+
# --- 2. Define the Start Timestamp ---
218221
start_timestamp = (
219222
arrow.now().float_timestamp
220-
if is_first_chunk
223+
if i == 0
221224
else float(splitArrays[i - 1][-1].get("created_at"))
222225
)
223226

227+
# --- 3. Build the Task ---
224228
tasks.append(
225229
scrape_messages(
226230
c,
@@ -232,7 +236,7 @@ def get_tasks(splitArrays, anchor_id, c, model_id, username, after):
232236
{"post_id": ele.get("post_id"), "timestamp": ele.get("created_at")}
233237
for ele in chunk
234238
],
235-
is_last_chunk=is_final_chunk,
239+
is_last_chunk=(i == len(splitArrays) - 1),
236240
after=after,
237241
)
238242
)

0 commit comments

Comments
 (0)