feat: Implement recursive web scraping and content extraction

2025-12-15 15:03:17 +00:00 · 2024-09-14 14:46:34 -05:00
parent cf99bd8bf1
commit 50c9e7898d
1 changed files with 46 additions and 9 deletions
--- a/cmd/web.go
+++ b/cmd/web.go
@@ -88,13 +88,44 @@ func runWeb(cmd *cobra.Command, args []string) error {
 }
 func generateDefaultFilename(urls []string) string {
-	// Simple implementation for now
+	timestamp := time.Now().Format("20060102-150405")
-	return "rollup-web-content.md"
+	return fmt.Sprintf("rollup-web-%s.md", timestamp)
 }
-func scrapeRecursively(url string, depth int) (string, error) {
+func scrapeRecursively(urlStr string, depth int) (string, error) {
-	// Simple implementation for now
+	visited := make(map[string]bool)
-	return extractAndConvertContent(url)
+	return scrapeURL(urlStr, depth, visited)
 }
 // scrapeURL converts the page at urlStr with extractAndConvertContent and, while
 // depth is greater than zero, appends the converted content of every link that
 // scraper.ExtractLinks reports for that page, recursing with depth-1.
 //
 // visited is shared across the whole recursion and is mutated here: a URL is
 // marked before it is fetched, so each URL is processed at most once even when
 // pages link to each other. A negative depth or an already-visited URL yields
 // ("", nil).
 //
 // Errors from fetching urlStr itself are returned as-is with empty content. If
 // link extraction fails, the content of urlStr is returned together with a
 // wrapped error. Failures on linked pages are reported as warnings on standard
 // output and do not abort the crawl.
 func scrapeURL(urlStr string, depth int, visited map[string]bool) (string, error) {
 	if depth < 0 || visited[urlStr] {
 		return "", nil
 	}
 	visited[urlStr] = true
 	content, err := extractAndConvertContent(urlStr)
 	if err != nil {
 		return "", err
 	}
 	if depth == 0 {
 		// Depth budget exhausted: do not follow links from this page.
 		return content, nil
 	}
 	links, err := scraper.ExtractLinks(urlStr)
 	if err != nil {
 		// %w keeps the underlying error inspectable with errors.Is / errors.As.
 		return content, fmt.Errorf("error extracting links: %w", err)
 	}
 	for _, link := range links {
 		subContent, err := scrapeURL(link, depth-1, visited)
 		if err != nil {
 			fmt.Printf("Warning: Error scraping %s: %v\n", link, err)
 			continue
 		}
 		if subContent == "" {
 			// Already-visited links return nothing; skip them so the output
 			// does not accumulate separators with no section behind them.
 			continue
 		}
 		content += "\n\n---\n\n" + subContent
 	}
 	return content, nil
 }
 func extractAndConvertContent(urlStr string) (string, error) {
@@ -103,10 +134,16 @@ func extractAndConvertContent(urlStr string) (string, error) {
 		return "", fmt.Errorf("error fetching webpage content: %v", err)
 	}
-	if cssSelector != "" || xpathSelector != "" {
+	if cssSelector != "" {
-		// TODO: Implement content extraction with CSS or XPath selector
+		content, err = scraper.ExtractContentWithCSS(content, cssSelector)
-		// For now, we'll just use the full content
+		if err != nil {
-		fmt.Println("Warning: CSS and XPath selectors are not yet implemented")
+			return "", fmt.Errorf("error extracting content with CSS selector: %v", err)
 		}
 	} else if xpathSelector != "" {
 		content, err = scraper.ExtractContentWithXPath(content, xpathSelector)
 		if err != nil {
 			return "", fmt.Errorf("error extracting content with XPath selector: %v", err)
 		}
 	}
 	// Create a new converter