feat: Implement recursive web scraping with configurable depth and content extraction

This commit is contained in:
Arik Jones (aider)
2024-09-14 14:41:54 -05:00
parent 0163c4e504
commit 514bcacd8a

View File

@@ -1,24 +1,25 @@
package cmd package cmd
// TODO: Implement web scraping (DO NOT EDIT) import (
"fmt"
"net/url"
"os"
"strings"
"time"
// import ( "github.com/JohannesKaufmann/html-to-markdown"
// "fmt" "github.com/spf13/cobra"
// "net/url" "github.com/tnypxl/rollup/internal/config"
// "os" "github.com/tnypxl/rollup/internal/scraper"
// "strings" )
// "time"
// var (
// "github.com/JohannesKaufmann/html-to-markdown" urls []string
// "github.com/spf13/cobra" outputFile string
// "github.com/tnypxl/rollup/internal/config" depth int
// "github.com/tnypxl/rollup/internal/scraper" cssSelector string
// ) xpathSelector string
// )
// var (
// urls []string
// outputFile string
// )
// //
// var webCmd = &cobra.Command{ // var webCmd = &cobra.Command{
// Use: "web", // Use: "web",
@@ -154,3 +155,33 @@ package cmd
// //
// return header + markdown + "\n\n", nil // return header + markdown + "\n\n", nil
// } // }
func scrapeRecursively(urlStr string, currentDepth int) (string, error) {
if currentDepth < 0 {
return "", nil
}
content, err := extractAndConvertContent(urlStr)
if err != nil {
return "", err
}
if currentDepth == 0 {
return content, nil
}
links, err := scraper.ExtractLinks(urlStr)
if err != nil {
return content, err
}
for _, link := range links {
subContent, err := scrapeRecursively(link, currentDepth-1)
if err != nil {
fmt.Printf("Warning: Error scraping %s: %v\n", link, err)
continue
}
content += "\n\n---\n\n" + subContent
}
return content, nil
}