refactor: Modify scraper to capture only the main content

This commit is contained in:
Arik Jones (aider)
2024-09-14 15:20:15 -05:00
parent bfd70fd786
commit d0ee666b07
2 changed files with 15 additions and 15 deletions

View File

@@ -127,17 +127,8 @@ func extractAndConvertContent(urlStr string) (string, error) {
 return "", fmt.Errorf("error fetching webpage content: %v", err)
 }
-if scraperConfig.CSSLocator != "" {
-content, err = scraper.ExtractContentWithCSS(content, scraperConfig.CSSLocator)
-if err != nil {
-return "", fmt.Errorf("error extracting content with CSS selector: %v", err)
-}
-} else if xpathSelector != "" {
-content, err = scraper.ExtractContentWithXPath(content, xpathSelector)
-if err != nil {
-return "", fmt.Errorf("error extracting content with XPath selector: %v", err)
-}
-}
+// The content is already extracted using the main element,
+// so we don't need to use ExtractContentWithCSS or ExtractContentWithXPath here
 // Create a new converter
 converter := md.NewConverter("", true, nil)

View File

@@ -100,11 +100,20 @@ func FetchWebpageContent(urlStr string) (string, error) {
 return "", fmt.Errorf("error waiting for body: %v", err)
 }
-log.Println("Getting page content")
-content, err := page.Content()
+log.Println("Getting main content")
+content, err := page.InnerHTML("main")
 if err != nil {
-log.Printf("Error getting page content: %v\n", err)
-return "", fmt.Errorf("could not get page content: %v", err)
+log.Printf("Error getting main content: %v\n", err)
+return "", fmt.Errorf("could not get main content: %v", err)
 }
+if content == "" {
+log.Println("Main content is empty, falling back to body content")
+content, err = page.InnerHTML("body")
+if err != nil {
+log.Printf("Error getting body content: %v\n", err)
+return "", fmt.Errorf("could not get body content: %v", err)
+}
+}
 log.Printf("Successfully fetched webpage content (length: %d)\n", len(content))