@@ -13,6 +13,8 @@ func StripHTML(s string) string {
1313 s = RemoveTagWithContent (s , "head" )
1414
1515 // Replace block-level elements with newlines before stripping tags
16+ // Note: other table elements (table, td, th, tbody, thead, tfoot) are NOT included
17+ // because they're typically used for layout; tr is included to separate rows
1618 blockTags := []string {"br" , "p" , "div" , "tr" , "li" , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" }
1719 for _ , tag := range blockTags {
1820 // Handle <br>, <br/>, <br />
@@ -52,17 +54,25 @@ func StripHTML(s string) string {
5254 text = strings .ReplaceAll (text , " " , " " )
5355 }
5456
55- // Collapse multiple newlines
56- for strings .Contains (text , "\n \n \n " ) {
57- text = strings .ReplaceAll (text , "\n \n \n " , "\n \n " )
58- }
59-
60- // Trim spaces from each line
57+ // Trim spaces from each line first
6158 lines := strings .Split (text , "\n " )
6259 for i , line := range lines {
6360 lines [i ] = strings .TrimSpace (line )
6461 }
65- text = strings .Join (lines , "\n " )
62+
63+ // Remove consecutive empty lines, keeping at most one blank line
64+ var cleanedLines []string
65+ prevEmpty := false
66+ for _ , line := range lines {
67+ isEmpty := line == ""
68+ if isEmpty && prevEmpty {
69+ continue // Skip consecutive empty lines
70+ }
71+ cleanedLines = append (cleanedLines , line )
72+ prevEmpty = isEmpty
73+ }
74+
75+ text = strings .Join (cleanedLines , "\n " )
6676
6777 // Remove leading/trailing empty lines
6878 return strings .TrimSpace (text )
0 commit comments