diff --git a/cmd/web.go b/cmd/web.go index 8714eac..6784871 100644 --- a/cmd/web.go +++ b/cmd/web.go @@ -127,17 +127,8 @@ func extractAndConvertContent(urlStr string) (string, error) { return "", fmt.Errorf("error fetching webpage content: %v", err) } - if scraperConfig.CSSLocator != "" { - content, err = scraper.ExtractContentWithCSS(content, scraperConfig.CSSLocator) - if err != nil { - return "", fmt.Errorf("error extracting content with CSS selector: %v", err) - } - } else if xpathSelector != "" { - content, err = scraper.ExtractContentWithXPath(content, xpathSelector) - if err != nil { - return "", fmt.Errorf("error extracting content with XPath selector: %v", err) - } - } + // The content is already extracted using the main element, + // so we don't need to use ExtractContentWithCSS or ExtractContentWithXPath here // Create a new converter converter := md.NewConverter("", true, nil) diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go index 6299e5a..bb5281a 100644 --- a/internal/scraper/scraper.go +++ b/internal/scraper/scraper.go @@ -100,11 +100,20 @@ func FetchWebpageContent(urlStr string) (string, error) { return "", fmt.Errorf("error waiting for body: %v", err) } - log.Println("Getting page content") - content, err := page.Content() + log.Println("Getting main content") + content, err := page.InnerHTML("main") if err != nil { - log.Printf("Error getting page content: %v\n", err) - return "", fmt.Errorf("could not get page content: %v", err) + log.Printf("Error getting main content: %v\n", err) + return "", fmt.Errorf("could not get main content: %v", err) + } + + if content == "" { + log.Println("Main content is empty, falling back to body content") + content, err = page.InnerHTML("body") + if err != nil { + log.Printf("Error getting body content: %v\n", err) + return "", fmt.Errorf("could not get body content: %v", err) + } } log.Printf("Successfully fetched webpage content (length: %d)\n", len(content))