From d0ee666b079a9f3443b85b98178decd7a741458b Mon Sep 17 00:00:00 2001 From: "Arik Jones (aider)" Date: Sat, 14 Sep 2024 15:20:15 -0500 Subject: [PATCH] refactor: Modify scraper to capture only the main content --- cmd/web.go | 13 ++----------- internal/scraper/scraper.go | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cmd/web.go b/cmd/web.go index 8714eac..6784871 100644 --- a/cmd/web.go +++ b/cmd/web.go @@ -127,17 +127,8 @@ func extractAndConvertContent(urlStr string) (string, error) { return "", fmt.Errorf("error fetching webpage content: %v", err) } - if scraperConfig.CSSLocator != "" { - content, err = scraper.ExtractContentWithCSS(content, scraperConfig.CSSLocator) - if err != nil { - return "", fmt.Errorf("error extracting content with CSS selector: %v", err) - } - } else if xpathSelector != "" { - content, err = scraper.ExtractContentWithXPath(content, xpathSelector) - if err != nil { - return "", fmt.Errorf("error extracting content with XPath selector: %v", err) - } - } + // The content is already extracted using the main element, + // so we don't need to use ExtractContentWithCSS or ExtractContentWithXPath here // Create a new converter converter := md.NewConverter("", true, nil) diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go index 6299e5a..bb5281a 100644 --- a/internal/scraper/scraper.go +++ b/internal/scraper/scraper.go @@ -100,11 +100,20 @@ func FetchWebpageContent(urlStr string) (string, error) { return "", fmt.Errorf("error waiting for body: %v", err) } - log.Println("Getting page content") - content, err := page.Content() + log.Println("Getting main content") + content, err := page.InnerHTML("main") if err != nil { - log.Printf("Error getting page content: %v\n", err) - return "", fmt.Errorf("could not get page content: %v", err) + log.Printf("Error getting main content: %v\n", err) + return "", fmt.Errorf("could not get main content: %v", err) + } + + if content == "" { + log.Println("Main content is empty, falling back to body content") + content, err = page.InnerHTML("body") + if err != nil { + log.Printf("Error getting body content: %v\n", err) + return "", fmt.Errorf("could not get body content: %v", err) + } } log.Printf("Successfully fetched webpage content (length: %d)\n", len(content))