diff --git a/cmd/notcrawl/main.go b/cmd/notcrawl/main.go index bd4596d..b4ee9b7 100644 --- a/cmd/notcrawl/main.go +++ b/cmd/notcrawl/main.go @@ -860,7 +860,7 @@ func blockPreviewLines(blocks []store.Block, maxLines int) []string { } lines := make([]string, 0, maxLines) for _, block := range blocks { - text := notiontext.CleanLegacyArtifacts(block.Text) + text := compactPreviewNoise(notiontext.CleanLegacyArtifacts(block.Text)) if text == "" { continue } @@ -891,6 +891,21 @@ func blockPreviewLines(blocks []store.Block, maxLines int) []string { return lines } +func compactPreviewNoise(s string) string { + s = strings.ReplaceAll(s, "linked pagess", "linked pages") + for strings.Contains(s, "linked page, linked page") || + strings.Contains(s, "linked pages, linked page") || + strings.Contains(s, "linked page, linked pages") || + strings.Contains(s, "linked pages, linked pages") { + s = strings.ReplaceAll(s, "linked pages, linked page", "linked pages") + s = strings.ReplaceAll(s, "linked page, linked pages", "linked pages") + s = strings.ReplaceAll(s, "linked pages, linked pages", "linked pages") + s = strings.ReplaceAll(s, "linked page, linked page", "linked pages") + s = strings.ReplaceAll(s, "linked pagess", "linked pages") + } + return s +} + func collectionPreview(collection store.Collection, space, parent string) string { var lines []string if space != "" { diff --git a/cmd/notcrawl/main_test.go b/cmd/notcrawl/main_test.go index 4022f96..5be34c5 100644 --- a/cmd/notcrawl/main_test.go +++ b/cmd/notcrawl/main_test.go @@ -194,6 +194,16 @@ func TestBlockPreviewCleansLegacyNotionMarkers(t *testing.T) { } } +func TestBlockPreviewCompactsRepeatedLinkedPages(t *testing.T) { + got := blockPreview([]store.Block{{ + Type: "paragraph", + Text: "linked page, linked page, linked page Add details", + }}, tuiPagePreviewMax) + if got != "linked pages Add details" { + t.Fatalf("got %q", got) + } +} + func TestPagePreviewIncludesComments(t *testing.T) { got := pagePreview( []store.Block{{Type: "paragraph", Text: "status update"}}, diff --git a/internal/notiontext/text.go b/internal/notiontext/text.go index be53eb3..1719f41 100644 --- a/internal/notiontext/text.go +++ b/internal/notiontext/text.go @@ -18,7 +18,7 @@ var ( legacyBareMentionRE = regexp.MustCompile(`‣\s+[0-9a-fA-F]{8}-[0-9a-fA-F-]{8,}`) spaceBeforePunctuationRE = regexp.MustCompile(`\s+([,.;:])`) repeatedCommaRE = regexp.MustCompile(`(?:,\s*){2,}`) - repeatedLinkedPageRE = regexp.MustCompile(`(?:linked page,\s*){2,}linked page`) + repeatedLinkedPageRE = regexp.MustCompile(`linked page\b(?:,\s*linked page\b)+`) ) func Normalize(s string) string { @@ -35,6 +35,7 @@ func CleanLegacyArtifacts(s string) string { s = Normalize(s) s = repeatedCommaRE.ReplaceAllString(s, ", ") s = repeatedLinkedPageRE.ReplaceAllString(s, "linked pages") + s = strings.ReplaceAll(s, "linked pagess", "linked pages") s = spaceBeforePunctuationRE.ReplaceAllString(s, "$1") s = strings.ReplaceAll(s, " and, ", ", ") return Normalize(s)