@@ -19,6 +19,10 @@ const TAB_WIDTH: usize = 8;
1919const NL : u8 = b'\n' ;
2020const CR : u8 = b'\r' ;
2121const TAB : u8 = b'\t' ;
/// Size cap (8 KiB) on the pending output buffer while streaming.
///
/// Once the buffer reaches this many bytes it becomes eligible for an early
/// flush. The value is a fixed trade-off: big enough that flushes stay
/// infrequent, small enough that memory use stays bounded when the input
/// never offers a fold point.
const STREAMING_FLUSH_THRESHOLD: usize = 8 * 1024;
2226
2327mod options {
2428 pub const BYTES : & str = "bytes" ;
@@ -288,6 +292,10 @@ fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize {
288292}
289293
290294fn emit_output < W : Write > ( ctx : & mut FoldContext < ' _ , W > ) -> UResult < ( ) > {
295+ // Emit one folded line:
296+ // - with `-s`, cut at the last remembered whitespace when possible
297+ // - otherwise, cut at the current buffer end
298+ // The remainder (if any) stays in the buffer for the next line.
291299 let consume = match * ctx. last_space {
292300 Some ( index) => index + 1 ,
293301 None => ctx. output . len ( ) ,
@@ -309,6 +317,7 @@ fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> {
309317 * ctx. col_count = compute_col_count ( ctx. output , ctx. mode ) ;
310318
311319 if ctx. spaces {
320+ // Rebase the remembered whitespace position into the remaining buffer.
312321 * ctx. last_space = last_space. and_then ( |idx| {
313322 if idx < consume {
314323 None
@@ -322,6 +331,36 @@ fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> {
322331 Ok ( ( ) )
323332}
324333
334+ fn maybe_flush_unbroken_output < W : Write > ( ctx : & mut FoldContext < ' _ , W > ) -> UResult < ( ) > {
335+ // In streaming mode without `-s`, avoid unbounded buffering by periodically
336+ // flushing long unbroken segments. With `-s` we must keep the buffer so we
337+ // can still break at the last whitespace boundary.
338+ if ctx. spaces || ctx. output . len ( ) < STREAMING_FLUSH_THRESHOLD {
339+ return Ok ( ( ) ) ;
340+ }
341+
342+ // Write raw bytes without inserting a newline; folding will continue
343+ // based on updated column tracking in the caller.
344+ ctx. writer . write_all ( ctx. output ) ?;
345+ ctx. output . clear ( ) ;
346+ Ok ( ( ) )
347+ }
348+
349+ fn push_byte < W : Write > ( ctx : & mut FoldContext < ' _ , W > , byte : u8 ) -> UResult < ( ) > {
350+ // Append a single byte to the buffer.
351+ ctx. output . push ( byte) ;
352+ maybe_flush_unbroken_output ( ctx)
353+ }
354+
355+ fn push_bytes < W : Write > ( ctx : & mut FoldContext < ' _ , W > , bytes : & [ u8 ] ) -> UResult < ( ) > {
356+ // Append a byte slice to the buffer and flush if it grows too large.
357+ if bytes. is_empty ( ) {
358+ return Ok ( ( ) ) ;
359+ }
360+ ctx. output . extend_from_slice ( bytes) ;
361+ maybe_flush_unbroken_output ( ctx)
362+ }
363+
325364fn process_ascii_line < W : Write > ( line : & [ u8 ] , ctx : & mut FoldContext < ' _ , W > ) -> UResult < ( ) > {
326365 let mut idx = 0 ;
327366 let len = line. len ( ) ;
@@ -331,15 +370,15 @@ fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UR
331370 NL => {
332371 * ctx. last_space = None ;
333372 emit_output ( ctx) ?;
334- break ;
373+ idx += 1 ;
335374 }
336375 CR => {
337- ctx . output . push ( CR ) ;
376+ push_byte ( ctx , CR ) ? ;
338377 * ctx. col_count = 0 ;
339378 idx += 1 ;
340379 }
341380 0x08 => {
342- ctx . output . push ( 0x08 ) ;
381+ push_byte ( ctx , 0x08 ) ? ;
343382 * ctx. col_count = ctx. col_count . saturating_sub ( 1 ) ;
344383 idx += 1 ;
345384 }
@@ -358,16 +397,23 @@ fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UR
358397 } else {
359398 * ctx. last_space = None ;
360399 }
361- ctx . output . push ( TAB ) ;
400+ push_byte ( ctx , TAB ) ? ;
362401 idx += 1 ;
363402 }
364403 0x00 ..=0x07 | 0x0B ..=0x0C | 0x0E ..=0x1F | 0x7F => {
365- ctx . output . push ( line[ idx] ) ;
404+ push_byte ( ctx , line[ idx] ) ? ;
366405 if ctx. spaces && line[ idx] . is_ascii_whitespace ( ) && line[ idx] != CR {
367406 * ctx. last_space = Some ( ctx. output . len ( ) - 1 ) ;
368407 } else if !ctx. spaces {
369408 * ctx. last_space = None ;
370409 }
410+
411+ if ctx. mode == WidthMode :: Characters {
412+ * ctx. col_count = ctx. col_count . saturating_add ( 1 ) ;
413+ if * ctx. col_count >= ctx. width {
414+ emit_output ( ctx) ?;
415+ }
416+ }
371417 idx += 1 ;
372418 }
373419 _ => {
@@ -405,7 +451,7 @@ fn push_ascii_segment<W: Write>(segment: &[u8], ctx: &mut FoldContext<'_, W>) ->
405451 let take = remaining. len ( ) . min ( available) ;
406452 let base_len = ctx. output . len ( ) ;
407453
408- ctx . output . extend_from_slice ( & remaining[ ..take] ) ;
454+ push_bytes ( ctx , & remaining[ ..take] ) ? ;
409455 * ctx. col_count += take;
410456
411457 if ctx. spaces {
@@ -430,16 +476,26 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes
430476 return process_ascii_line ( line. as_bytes ( ) , ctx) ;
431477 }
432478
479+ process_utf8_chars ( line, ctx)
480+ }
481+
482+ fn process_utf8_chars < W : Write > ( line : & str , ctx : & mut FoldContext < ' _ , W > ) -> UResult < ( ) > {
433483 let line_bytes = line. as_bytes ( ) ;
434484 let mut iter = line. char_indices ( ) . peekable ( ) ;
435485
436486 while let Some ( ( byte_idx, ch) ) = iter. next ( ) {
437- // Include combining characters with the base character
438- while let Some ( & ( _, next_ch) ) = iter. peek ( ) {
439- if unicode_width:: UnicodeWidthChar :: width ( next_ch) . unwrap_or ( 1 ) == 0 {
440- iter. next ( ) ;
441- } else {
442- break ;
487+ // Include combining characters with the base character when we are
488+ // measuring by display columns. In character-counting mode every
489+ // scalar value must advance the counter to match `chars().count()`
490+ // semantics (see `fold_characters_reference` in the tests), so we do
491+ // not coalesce zero-width scalars there.
492+ if ctx. mode == WidthMode :: Columns {
493+ while let Some ( & ( _, next_ch) ) = iter. peek ( ) {
494+ if unicode_width:: UnicodeWidthChar :: width ( next_ch) . unwrap_or ( 1 ) == 0 {
495+ iter. next ( ) ;
496+ } else {
497+ break ;
498+ }
443499 }
444500 }
445501
@@ -448,23 +504,21 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes
448504 if ch == '\n' {
449505 * ctx. last_space = None ;
450506 emit_output ( ctx) ?;
451- break ;
507+ continue ;
452508 }
453509
454510 if * ctx. col_count >= ctx. width {
455511 emit_output ( ctx) ?;
456512 }
457513
458514 if ch == '\r' {
459- ctx. output
460- . extend_from_slice ( & line_bytes[ byte_idx..next_idx] ) ;
515+ push_bytes ( ctx, & line_bytes[ byte_idx..next_idx] ) ?;
461516 * ctx. col_count = 0 ;
462517 continue ;
463518 }
464519
465520 if ch == '\x08' {
466- ctx. output
467- . extend_from_slice ( & line_bytes[ byte_idx..next_idx] ) ;
521+ push_bytes ( ctx, & line_bytes[ byte_idx..next_idx] ) ?;
468522 * ctx. col_count = ctx. col_count . saturating_sub ( 1 ) ;
469523 continue ;
470524 }
@@ -484,8 +538,7 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes
484538 } else {
485539 * ctx. last_space = None ;
486540 }
487- ctx. output
488- . extend_from_slice ( & line_bytes[ byte_idx..next_idx] ) ;
541+ push_bytes ( ctx, & line_bytes[ byte_idx..next_idx] ) ?;
489542 continue ;
490543 }
491544
@@ -506,8 +559,7 @@ fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> URes
506559 * ctx. last_space = Some ( ctx. output . len ( ) ) ;
507560 }
508561
509- ctx. output
510- . extend_from_slice ( & line_bytes[ byte_idx..next_idx] ) ;
562+ push_bytes ( ctx, & line_bytes[ byte_idx..next_idx] ) ?;
511563 * ctx. col_count = ctx. col_count . saturating_add ( added) ;
512564 }
513565
@@ -519,7 +571,7 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) ->
519571 if byte == NL {
520572 * ctx. last_space = None ;
521573 emit_output ( ctx) ?;
522- break ;
574+ continue ;
523575 }
524576
525577 if * ctx. col_count >= ctx. width {
@@ -539,7 +591,7 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) ->
539591 } else {
540592 None
541593 } ;
542- ctx . output . push ( byte) ;
594+ push_byte ( ctx , byte) ? ;
543595 continue ;
544596 }
545597 0x08 => * ctx. col_count = ctx. col_count . saturating_sub ( 1 ) ,
@@ -550,7 +602,46 @@ fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) ->
550602 _ => * ctx. col_count = ctx. col_count . saturating_add ( 1 ) ,
551603 }
552604
553- ctx. output . push ( byte) ;
605+ push_byte ( ctx, byte) ?;
606+ }
607+
608+ Ok ( ( ) )
609+ }
610+
611+ /// Process buffered bytes, emitting output for valid UTF-8 prefixes and
612+ /// deferring incomplete sequences until more input arrives.
613+ ///
614+ /// If the buffer contains invalid UTF-8, it is handled in non-UTF-8 mode and
615+ /// the buffer is fully consumed.
616+ fn process_pending_chunk < W : Write > (
617+ pending : & mut Vec < u8 > ,
618+ ctx : & mut FoldContext < ' _ , W > ,
619+ ) -> UResult < ( ) > {
620+ while !pending. is_empty ( ) {
621+ match std:: str:: from_utf8 ( pending) {
622+ Ok ( valid) => {
623+ process_utf8_line ( valid, ctx) ?;
624+ pending. clear ( ) ;
625+ break ;
626+ }
627+ Err ( err) => {
628+ if err. error_len ( ) . is_some ( ) {
629+ let res = process_non_utf8_line ( pending, ctx) ;
630+ pending. clear ( ) ;
631+ res?;
632+ break ;
633+ }
634+
635+ let valid_up_to = err. valid_up_to ( ) ;
636+ if valid_up_to == 0 {
637+ break ;
638+ }
639+
640+ let valid = std:: str:: from_utf8 ( & pending[ ..valid_up_to] ) . expect ( "valid prefix" ) ;
641+ process_utf8_line ( valid, ctx) ?;
642+ pending. drain ( ..valid_up_to) ;
643+ }
644+ }
554645 }
555646
556647 Ok ( ( ) )
@@ -572,20 +663,12 @@ fn fold_file<T: Read, W: Write>(
572663 mode : WidthMode ,
573664 writer : & mut W ,
574665) -> UResult < ( ) > {
575- let mut line = Vec :: new ( ) ;
576666 let mut output = Vec :: new ( ) ;
577667 let mut col_count = 0 ;
578668 let mut last_space = None ;
669+ let mut pending = Vec :: with_capacity ( 8 * 1024 ) ;
579670
580- loop {
581- if file
582- . read_until ( NL , & mut line)
583- . map_err_context ( || translate ! ( "fold-error-readline" ) ) ?
584- == 0
585- {
586- break ;
587- }
588-
671+ {
589672 let mut ctx = FoldContext {
590673 spaces,
591674 width,
@@ -596,17 +679,32 @@ fn fold_file<T: Read, W: Write>(
596679 last_space : & mut last_space,
597680 } ;
598681
599- match std:: str:: from_utf8 ( & line) {
600- Ok ( s) => process_utf8_line ( s, & mut ctx) ?,
601- Err ( _) => process_non_utf8_line ( & line, & mut ctx) ?,
682+ loop {
683+ let buffer = file
684+ . fill_buf ( )
685+ . map_err_context ( || translate ! ( "fold-error-readline" ) ) ?;
686+ if buffer. is_empty ( ) {
687+ break ;
688+ }
689+ pending. extend_from_slice ( buffer) ;
690+ let consumed = buffer. len ( ) ;
691+ file. consume ( consumed) ;
692+
693+ process_pending_chunk ( & mut pending, & mut ctx) ?;
602694 }
603695
604- line. clear ( ) ;
605- }
696+ if !pending. is_empty ( ) {
697+ match std:: str:: from_utf8 ( & pending) {
698+ Ok ( s) => process_utf8_line ( s, & mut ctx) ?,
699+ Err ( _) => process_non_utf8_line ( & pending, & mut ctx) ?,
700+ }
701+ pending. clear ( ) ;
702+ }
606703
607- if !output. is_empty ( ) {
608- writer. write_all ( & output) ?;
609- output. clear ( ) ;
704+ if !ctx. output . is_empty ( ) {
705+ ctx. writer . write_all ( ctx. output ) ?;
706+ ctx. output . clear ( ) ;
707+ }
610708 }
611709
612710 Ok ( ( ) )
0 commit comments