From 751ea5828d9600ef110bbd791a2282ec50e23fb2 Mon Sep 17 00:00:00 2001 From: "Arik Jones (aider)" Date: Sat, 21 Sep 2024 10:55:01 -0500 Subject: [PATCH] refactor: update ScrapeSites to handle base_url and allowed_paths --- internal/scraper/scraper.go | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go index 0a96dd6..c766ba2 100644 --- a/internal/scraper/scraper.go +++ b/internal/scraper/scraper.go @@ -72,7 +72,10 @@ func ScrapeSites(config Config) (map[string]string, error) { wg.Add(1) go func(site SiteConfig) { defer wg.Done() - scrapeSite(site, config, results, limiter) + for _, path := range site.AllowedPaths { + fullURL := site.BaseURL + path + scrapeSingleURL(fullURL, site, config, results, limiter) + } }(site) } @@ -93,6 +96,31 @@ func ScrapeSites(config Config) (map[string]string, error) { return scrapedContent, nil } +func scrapeSingleURL(url string, site SiteConfig, config Config, results chan<- struct { + url string + content string + err error +}, limiter *rate.Limiter) { + // Wait for rate limiter before making the request + err := limiter.Wait(context.Background()) + if err != nil { + results <- struct { + url string + content string + err error + }{url, "", fmt.Errorf("rate limiter error: %v", err)} + return + } + + cssLocator, excludeSelectors := getOverrides(url, site) + content, err := scrapeURL(url, cssLocator, excludeSelectors) + results <- struct { + url string + content string + err error + }{url, content, err} +} + func scrapeSite(site SiteConfig, config Config, results chan<- struct { url string content string