Mirror of https://github.com/tnypxl/rollup.git (synced 2025-12-15 15:03:17 +00:00)
fix: Improve page content extraction in scraper
This commit is contained in:
Committed by: Arik Jones (aider)
Parent: 3378402fb9
Commit: 01d6b2f54f
@@ -100,15 +100,15 @@ func FetchWebpageContent(urlStr string) (string, error) {
|
|||||||
return "", fmt.Errorf("error waiting for body: %v", err)
|
return "", fmt.Errorf("error waiting for body: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Println("Getting main content")
|
log.Println("Getting page content")
|
||||||
content, err := page.InnerHTML("main")
|
content, err := page.Content()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Error getting main content: %v\n", err)
|
log.Printf("Error getting page content: %v\n", err)
|
||||||
return "", fmt.Errorf("could not get main content: %v", err)
|
return "", fmt.Errorf("could not get page content: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
if content == "" {
|
if content == "" {
|
||||||
log.Println("Main content is empty, falling back to body content")
|
log.Println(" content is empty, falling back to body content")
|
||||||
content, err = page.InnerHTML("body")
|
content, err = page.InnerHTML("body")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Error getting body content: %v\n", err)
|
log.Printf("Error getting body content: %v\n", err)
|
||||||
@@ -129,17 +129,17 @@ func ProcessHTMLContent(htmlContent string, config Config) (string, error) {
|
|||||||
return "", fmt.Errorf("error parsing HTML: %v", err)
|
return "", fmt.Errorf("error parsing HTML: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
var selection *goquery.Selection
|
selection := doc.Find("body")
|
||||||
if config.CSSLocator != "" {
|
if config.CSSLocator != "" {
|
||||||
log.Printf("Using CSS locator: %s\n", config.CSSLocator)
|
log.Printf("Using CSS locator: %s\n", config.CSSLocator)
|
||||||
selection = doc.Find(config.CSSLocator)
|
tempSelection := doc.Find(config.CSSLocator)
|
||||||
if selection.Length() == 0 {
|
if tempSelection.Length() > 0 {
|
||||||
|
selection = tempSelection
|
||||||
|
} else {
|
||||||
log.Printf("Warning: No content found with CSS locator: %s. Falling back to body content.\n", config.CSSLocator)
|
log.Printf("Warning: No content found with CSS locator: %s. Falling back to body content.\n", config.CSSLocator)
|
||||||
selection = doc.Find("body")
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
log.Println("No CSS locator provided, processing entire body")
|
log.Println("No CSS locator provided, processing entire body")
|
||||||
selection = doc.Find("body")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if selection.Length() == 0 {
|
if selection.Length() == 0 {
|
||||||
|
|||||||
Reference in New Issue
Block a user