From 01d6b2f54f33b9f3afa2984a4c8438939c8abba1 Mon Sep 17 00:00:00 2001 From: Arik Jones Date: Sat, 14 Sep 2024 19:59:01 -0500 Subject: [PATCH] fix: Improve page content extraction in scraper --- internal/scraper/scraper.go | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go index 3d1b912..ab6e48f 100644 --- a/internal/scraper/scraper.go +++ b/internal/scraper/scraper.go @@ -100,15 +100,15 @@ func FetchWebpageContent(urlStr string) (string, error) { return "", fmt.Errorf("error waiting for body: %v", err) } - log.Println("Getting main content") - content, err := page.InnerHTML("main") + log.Println("Getting page content") + content, err := page.Content() if err != nil { - log.Printf("Error getting main content: %v\n", err) - return "", fmt.Errorf("could not get main content: %v", err) + log.Printf("Error getting page content: %v\n", err) + return "", fmt.Errorf("could not get page content: %v", err) } if content == "" { - log.Println("Main content is empty, falling back to body content") + log.Println(" content is empty, falling back to body content") content, err = page.InnerHTML("body") if err != nil { log.Printf("Error getting body content: %v\n", err) @@ -129,17 +129,17 @@ func ProcessHTMLContent(htmlContent string, config Config) (string, error) { return "", fmt.Errorf("error parsing HTML: %v", err) } - var selection *goquery.Selection + selection := doc.Find("body") if config.CSSLocator != "" { log.Printf("Using CSS locator: %s\n", config.CSSLocator) - selection = doc.Find(config.CSSLocator) - if selection.Length() == 0 { + tempSelection := doc.Find(config.CSSLocator) + if tempSelection.Length() > 0 { + selection = tempSelection + } else { log.Printf("Warning: No content found with CSS locator: %s. Falling back to body content.\n", config.CSSLocator) - selection = doc.Find("body") } } else { log.Println("No CSS locator provided, processing entire body") - selection = doc.Find("body") } if selection.Length() == 0 {