From ee1561c502ec3e9a8425dd4f9e2f86c261d4fae8 Mon Sep 17 00:00:00 2001
From: "Arik Jones (aider)"
Date: Mon, 30 Sep 2024 14:05:10 -0500
Subject: [PATCH] feat: add LinksContainerSelector to SiteConfig and enhance
 scraping logic with depth control and link extraction

---
 internal/config/config.go   |  1 +
 internal/scraper/scraper.go | 92 +++++++++++++++++++++++++++++--------
 2 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/internal/config/config.go b/internal/config/config.go
index 0042396..923c62c 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -30,6 +30,7 @@ type SiteConfig struct {
 	ExcludePaths  []string       `yaml:"exclude_paths"`
 	OutputAlias   string         `yaml:"output_alias"`
 	PathOverrides []PathOverride `yaml:"path_overrides"`
+	LinksContainerSelector string `yaml:"links_container_selector"`
 }
 
 type PathOverride struct {
diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go
index 54ab0a8..c509345 100644
--- a/internal/scraper/scraper.go
+++ b/internal/scraper/scraper.go
@@ -50,6 +50,7 @@ type SiteConfig struct {
 	ExcludePaths  []string
 	OutputAlias   string
 	PathOverrides []PathOverride
+	LinksContainerSelector string
 }
 
 // PathOverride holds path-specific overrides
@@ -112,8 +113,12 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
 	url     string
 	content string
 	err     error
-}, limiter *rate.Limiter,
+}, limiter *rate.Limiter, visited map[string]bool, currentDepth int,
 ) {
+	if site.MaxDepth > 0 && currentDepth > site.MaxDepth {
+		return
+	}
+
 	logger.Printf("Starting to scrape URL: %s\n", url)
 
 	// Wait for rate limiter before making the request
@@ -128,13 +133,9 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
 		return
 	}
 
-	cssLocator, excludeSelectors := getOverrides(url, site)
-	logger.Printf("Using CSS locator for %s: %s\n", url, cssLocator)
-	logger.Printf("Exclude selectors for %s: %v\n", url, excludeSelectors)
-
-	content, err := scrapeURL(url, cssLocator, excludeSelectors)
+	content, err := FetchWebpageContent(url)
 	if err != nil {
-		logger.Printf("Error scraping %s: %v\n", url, err)
+		logger.Printf("Error fetching content for %s: %v\n", url, err)
 		results <- struct {
 			url     string
 			content string
@@ -143,17 +144,61 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
 		return
 	}
 
-	if content == "" {
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
+	if err != nil {
+		logger.Printf("Error parsing HTML for %s: %v\n", url, err)
+		results <- struct {
+			url     string
+			content string
+			err     error
+		}{url, "", fmt.Errorf("error parsing HTML: %v", err)}
+		return
+	}
+
+	if site.LinksContainerSelector != "" {
+		logger.Printf("Processing links container for %s\n", url)
+		linkContainers := doc.Find(site.LinksContainerSelector)
+		linkContainers.Each(func(i int, container *goquery.Selection) {
+			container.Find("a[href]").Each(func(j int, link *goquery.Selection) {
+				href, exists := link.Attr("href")
+				if exists {
+					resolvedURL := resolveURL(href, url)
+					if isAllowedURL(resolvedURL, site) && !visited[resolvedURL] {
+						visited[resolvedURL] = true
+						go scrapeSingleURL(resolvedURL, site, results, limiter, visited, currentDepth+1)
+					}
+				}
+			})
+		})
+		return
+	}
+
+	cssLocator, excludeSelectors := getOverrides(url, site)
+	logger.Printf("Using CSS locator for %s: %s\n", url, cssLocator)
+	logger.Printf("Exclude selectors for %s: %v\n", url, excludeSelectors)
+
+	extractedContent, err := ExtractContentWithCSS(content, cssLocator, excludeSelectors)
+	if err != nil {
logger.Printf("Error extracting content for %s: %v\n", url, err) + results <- struct { + url string + content string + err error + }{url, "", err} + return + } + + if extractedContent == "" { logger.Printf("Warning: Empty content scraped from %s\n", url) } else { - logger.Printf("Successfully scraped content from %s (length: %d)\n", url, len(content)) + logger.Printf("Successfully scraped content from %s (length: %d)\n", url, len(extractedContent)) } results <- struct { url string content string err error - }{url, content, nil} + }{url, extractedContent, nil} } func scrapeSite(site SiteConfig, results chan<- struct { @@ -220,18 +265,29 @@ func isAllowedURL(urlStr string, site SiteConfig) bool { } path := parsedURL.Path - for _, allowedPath := range site.AllowedPaths { - if strings.HasPrefix(path, allowedPath) { - for _, excludePath := range site.ExcludePaths { - if strings.HasPrefix(path, excludePath) { - return false - } + + // Check if the URL is within allowed paths + if len(site.AllowedPaths) > 0 { + allowed := false + for _, allowedPath := range site.AllowedPaths { + if strings.HasPrefix(path, allowedPath) { + allowed = true + break } - return true + } + if !allowed { + return false } } - return false + // Check if the URL is in excluded paths + for _, excludePath := range site.ExcludePaths { + if strings.HasPrefix(path, excludePath) { + return false + } + } + + return true } func getOverrides(urlStr string, site SiteConfig) (string, []string) {