From d0ee666b079a9f3443b85b98178decd7a741458b Mon Sep 17 00:00:00 2001
From: "Arik Jones (aider)" <github@tnypxl.com>
Date: Sat, 14 Sep 2024 15:20:15 -0500
Subject: [PATCH] refactor: Modify scraper to capture only the main content

---
 cmd/web.go                  | 13 ++-----------
 internal/scraper/scraper.go | 17 +++++++++++++----
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/cmd/web.go b/cmd/web.go
index 8714eac..6784871 100644
--- a/cmd/web.go
+++ b/cmd/web.go
@@ -127,17 +127,8 @@ func extractAndConvertContent(urlStr string) (string, error) {
 		return "", fmt.Errorf("error fetching webpage content: %v", err)
 	}
 
-	if scraperConfig.CSSLocator != "" {
-		content, err = scraper.ExtractContentWithCSS(content, scraperConfig.CSSLocator)
-		if err != nil {
-			return "", fmt.Errorf("error extracting content with CSS selector: %v", err)
-		}
-	} else if xpathSelector != "" {
-		content, err = scraper.ExtractContentWithXPath(content, xpathSelector)
-		if err != nil {
-			return "", fmt.Errorf("error extracting content with XPath selector: %v", err)
-		}
-	}
+	// The fetched content is already limited to the inner HTML of <main> (or of
+	// <body> when <main> is empty), so the CSS and XPath extractors are not applied here.
 
 	// Create a new converter
 	converter := md.NewConverter("", true, nil)
diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go
index 6299e5a..bb5281a 100644
--- a/internal/scraper/scraper.go
+++ b/internal/scraper/scraper.go
@@ -100,11 +100,20 @@ func FetchWebpageContent(urlStr string) (string, error) {
 		return "", fmt.Errorf("error waiting for body: %v", err)
 	}
 
-	log.Println("Getting page content")
-	content, err := page.Content()
+	log.Println("Getting main content")
+	content, err := page.InnerHTML("main")
 	if err != nil {
-		log.Printf("Error getting page content: %v\n", err)
-		return "", fmt.Errorf("could not get page content: %v", err)
+		log.Printf("Error getting main content: %v\n", err)
+		return "", fmt.Errorf("could not get main content: %v", err)
+	}
+
+	if content == "" {
+		log.Println("Main content is empty, falling back to body content")
+		content, err = page.InnerHTML("body")
+		if err != nil {
+			log.Printf("Error getting body content: %v\n", err)
+			return "", fmt.Errorf("could not get body content: %v", err)
+		}
 	}
 
 	log.Printf("Successfully fetched webpage content (length: %d)\n", len(content))