fix: Improve page content extraction in scraper

This commit is contained in:
Arik Jones
2024-09-14 19:59:01 -05:00
committed by Arik Jones (aider)
parent 3378402fb9
commit 01d6b2f54f

View File

@@ -100,15 +100,15 @@ func FetchWebpageContent(urlStr string) (string, error) {
return "", fmt.Errorf("error waiting for body: %v", err)
}
log.Println("Getting main content")
content, err := page.InnerHTML("main")
log.Println("Getting page content")
content, err := page.Content()
if err != nil {
log.Printf("Error getting main content: %v\n", err)
return "", fmt.Errorf("could not get main content: %v", err)
log.Printf("Error getting page content: %v\n", err)
return "", fmt.Errorf("could not get page content: %v", err)
}
if content == "" {
log.Println("Main content is empty, falling back to body content")
log.Println(" content is empty, falling back to body content")
content, err = page.InnerHTML("body")
if err != nil {
log.Printf("Error getting body content: %v\n", err)
@@ -129,17 +129,17 @@ func ProcessHTMLContent(htmlContent string, config Config) (string, error) {
return "", fmt.Errorf("error parsing HTML: %v", err)
}
var selection *goquery.Selection
selection := doc.Find("body")
if config.CSSLocator != "" {
log.Printf("Using CSS locator: %s\n", config.CSSLocator)
selection = doc.Find(config.CSSLocator)
if selection.Length() == 0 {
tempSelection := doc.Find(config.CSSLocator)
if tempSelection.Length() > 0 {
selection = tempSelection
} else {
log.Printf("Warning: No content found with CSS locator: %s. Falling back to body content.\n", config.CSSLocator)
selection = doc.Find("body")
}
} else {
log.Println("No CSS locator provided, processing entire body")
selection = doc.Find("body")
}
if selection.Length() == 0 {