refactor: update ScrapeSites to handle base_url and allowed_paths

Author: Arik Jones (aider)
Date:   2024-09-21 10:55:01 -05:00
parent 7e4f4cdbb6
commit 751ea5828d


@@ -72,7 +72,10 @@ func ScrapeSites(config Config) (map[string]string, error) {
 		wg.Add(1)
 		go func(site SiteConfig) {
 			defer wg.Done()
-			scrapeSite(site, config, results, limiter)
+			for _, path := range site.AllowedPaths {
+				fullURL := site.BaseURL + path
+				scrapeSingleURL(fullURL, site, config, results, limiter)
+			}
 		}(site)
 	}
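The new loop relies on the two SiteConfig fields named in the commit subject: BaseURL and AllowedPaths. A minimal sketch of that shape, assuming each path carries its own leading slash since the hunk joins them with plain string concatenation; everything beyond those two field names is illustrative:

package main

import "fmt"

// Sketch of the SiteConfig fields this hunk depends on. The real struct
// presumably also carries whatever getOverrides reads (selector overrides
// and the like); those are omitted here.
type SiteConfig struct {
	BaseURL      string   // e.g. "https://example.com" (no trailing slash)
	AllowedPaths []string // joined verbatim as BaseURL + path
}

func main() {
	site := SiteConfig{
		BaseURL:      "https://example.com",
		AllowedPaths: []string{"/docs", "/blog"},
	}
	for _, path := range site.AllowedPaths {
		fmt.Println(site.BaseURL + path) // https://example.com/docs, then /blog
	}
}

Because the join is plain concatenation, a path missing its leading slash (or a BaseURL with a trailing one) would yield a malformed or double-slashed URL; url.JoinPath from net/url is the usual guard if the config can't be trusted to be consistent.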
@@ -93,6 +96,31 @@ func ScrapeSites(config Config) (map[string]string, error) {
 	return scrapedContent, nil
 }
 
+func scrapeSingleURL(url string, site SiteConfig, config Config, results chan<- struct {
+	url     string
+	content string
+	err     error
+}, limiter *rate.Limiter) {
+	// Wait for rate limiter before making the request
+	err := limiter.Wait(context.Background())
+	if err != nil {
+		results <- struct {
+			url     string
+			content string
+			err     error
+		}{url, "", fmt.Errorf("rate limiter error: %v", err)}
+		return
+	}
+
+	cssLocator, excludeSelectors := getOverrides(url, site)
+	content, err := scrapeURL(url, cssLocator, excludeSelectors)
+	results <- struct {
+		url     string
+		content string
+		err     error
+	}{url, content, err}
+}
+
 func scrapeSite(site SiteConfig, config Config, results chan<- struct {
 	url     string
 	content string
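For context on where these sends land: ScrapeSites fans out one goroutine per site, each scrapeSingleURL call reports on the shared results channel, and a collector presumably drains that channel into the map[string]string the function returns. A self-contained sketch of that fan-out/fan-in pattern under a shared rate.Limiter; apart from the channel's url/content/err fields, all names here are assumptions, not code from this commit:

package main

import (
	"context"
	"fmt"
	"sync"

	"golang.org/x/time/rate"
)

type result struct {
	url     string
	content string
	err     error
}

// fetch stands in for scrapeSingleURL: wait on the shared limiter,
// then report exactly one result for the given URL.
func fetch(ctx context.Context, url string, limiter *rate.Limiter, results chan<- result) {
	if err := limiter.Wait(ctx); err != nil {
		results <- result{url, "", fmt.Errorf("rate limiter error: %v", err)}
		return
	}
	results <- result{url: url, content: "<stub content for " + url + ">"}
}

func main() {
	urls := []string{"https://example.com/docs", "https://example.com/blog"}
	limiter := rate.NewLimiter(rate.Limit(2), 1) // 2 requests/second shared across all goroutines

	results := make(chan result)
	var wg sync.WaitGroup
	for _, u := range urls {
		wg.Add(1)
		go func(u string) {
			defer wg.Done()
			fetch(context.Background(), u, limiter, results)
		}(u)
	}
	// Close the channel once every worker has sent, so the collector loop ends.
	go func() {
		wg.Wait()
		close(results)
	}()

	scraped := make(map[string]string)
	for r := range results {
		if r.err != nil {
			fmt.Println("skipping", r.url, "-", r.err)
			continue
		}
		scraped[r.url] = r.content
	}
	fmt.Println(scraped)
}

Naming the result struct, as the sketch does, would also spare the code from repeating the anonymous struct literal in every signature and send.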