From 54c3776baf0bc92a632308f926fd048c1d7de5ab Mon Sep 17 00:00:00 2001
From: "Arik Jones (aider)"
Date: Mon, 30 Sep 2024 14:08:16 -0500
Subject: [PATCH] fix: update scrapeSingleURL calls to include visited map and currentDepth for thread safety and correct functionality

---
 internal/scraper/scraper.go | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go
index c509345..7aaafbd 100644
--- a/internal/scraper/scraper.go
+++ b/internal/scraper/scraper.go
@@ -73,16 +73,20 @@ func ScrapeSites(config Config) (map[string]string, error) {
 
 	var wg sync.WaitGroup
 	totalURLs := 0
+	var mu sync.Mutex
 	for _, site := range config.Sites {
 		logger.Printf("Processing site: %s\n", site.BaseURL)
 		wg.Add(1)
 		go func(site SiteConfig) {
 			defer wg.Done()
+			visited := make(map[string]bool)
 			for _, path := range site.AllowedPaths {
 				fullURL := site.BaseURL + path
+				mu.Lock()
 				totalURLs++
+				mu.Unlock()
 				logger.Printf("Queueing URL for scraping: %s\n", fullURL)
-				scrapeSingleURL(fullURL, site, results, limiter)
+				scrapeSingleURL(fullURL, site, results, limiter, visited, 0)
 			}
 		}(site)
 	}
@@ -113,12 +117,16 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
 	url     string
 	content string
 	err     error
-}, limiter *rate.Limiter, visited map[string]bool, currentDepth int,
-) {
+}, limiter *rate.Limiter, visited map[string]bool, currentDepth int) {
 	if site.MaxDepth > 0 && currentDepth > site.MaxDepth {
 		return
 	}
 
+	if visited[url] {
+		return
+	}
+	visited[url] = true
+
 	logger.Printf("Starting to scrape URL: %s\n", url)
 
 	// Wait for rate limiter before making the request
@@ -164,7 +172,6 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
 			if exists {
 				resolvedURL := resolveURL(href, url)
 				if isAllowedURL(resolvedURL, site) && !visited[resolvedURL] {
-					visited[resolvedURL] = true
 					go scrapeSingleURL(resolvedURL, site, results, limiter, visited, currentDepth+1)
 				}
 			}