fix: update scrapeSingleURL calls to include visited map and currentDepth for thread safety and correct functionality

Arik Jones (aider)
2024-09-30 14:08:16 -05:00
parent ee1561c502
commit 54c3776baf


@@ -73,16 +73,20 @@ func ScrapeSites(config Config) (map[string]string, error) {
 	var wg sync.WaitGroup
 	totalURLs := 0
+	var mu sync.Mutex
 	for _, site := range config.Sites {
 		logger.Printf("Processing site: %s\n", site.BaseURL)
 		wg.Add(1)
 		go func(site SiteConfig) {
 			defer wg.Done()
+			visited := make(map[string]bool)
 			for _, path := range site.AllowedPaths {
 				fullURL := site.BaseURL + path
+				mu.Lock()
 				totalURLs++
+				mu.Unlock()
 				logger.Printf("Queueing URL for scraping: %s\n", fullURL)
-				scrapeSingleURL(fullURL, site, results, limiter)
+				scrapeSingleURL(fullURL, site, results, limiter, visited, 0)
 			}
 		}(site)
 	}
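Why the mutex: every site is processed in its own goroutine, so the shared totalURLs counter would otherwise be incremented concurrently, which is a data race in Go. A minimal standalone sketch of the pattern this hunk applies (illustrative names, not this repository's code):

package main

import (
	"fmt"
	"sync"
)

func main() {
	var (
		wg    sync.WaitGroup
		mu    sync.Mutex
		total int // shared across goroutines, guarded by mu
	)
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := 0; j < 1000; j++ {
				mu.Lock() // total++ alone is a read-modify-write race
				total++
				mu.Unlock()
			}
		}()
	}
	wg.Wait()
	fmt.Println(total) // always 4000 with the lock; unpredictable without it
}

Running the unlocked variant under go run -race reports the race; for a bare counter, an atomic.Int64 from sync/atomic would also work without a lock.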
@@ -113,12 +117,16 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
 	url     string
 	content string
 	err     error
-}, limiter *rate.Limiter, visited map[string]bool, currentDepth int,
-) {
+}, limiter *rate.Limiter, visited map[string]bool, currentDepth int) {
 	if site.MaxDepth > 0 && currentDepth > site.MaxDepth {
 		return
 	}
+	if visited[url] {
+		return
+	}
+	visited[url] = true
 	logger.Printf("Starting to scrape URL: %s\n", url)
 
 	// Wait for rate limiter before making the request
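The new guard at the top of scrapeSingleURL deduplicates work: a URL reachable through several links is scraped once, and marking visited[url] here replaces the per-link marking removed in the next hunk. One caveat: the recursive calls below are spawned with go while sharing this plain map, and Go maps are not safe for concurrent access; the check and the mark are also two separate steps. A hypothetical mutex-guarded set (an assumption, not part of this commit) would make both atomic:

package scraper

import "sync"

// visitedSet is a hypothetical concurrency-safe stand-in for the
// map[string]bool that this commit threads through the goroutines.
type visitedSet struct {
	mu   sync.Mutex
	seen map[string]bool
}

func newVisitedSet() *visitedSet {
	return &visitedSet{seen: make(map[string]bool)}
}

// Visit marks url as seen and reports whether it was seen before, so
// check-then-mark happens under one lock rather than as two racy steps.
func (v *visitedSet) Visit(url string) bool {
	v.mu.Lock()
	defer v.mu.Unlock()
	if v.seen[url] {
		return true
	}
	v.seen[url] = true
	return false
}

With such a type, the guard collapses to: if visited.Visit(url) { return }.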
@@ -164,7 +172,6 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
 	if exists {
 		resolvedURL := resolveURL(href, url)
 		if isAllowedURL(resolvedURL, site) && !visited[resolvedURL] {
-			visited[resolvedURL] = true
 			go scrapeSingleURL(resolvedURL, site, results, limiter, visited, currentDepth+1)
 		}
 	}
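Taken together, the hunks bound the crawl: each recursive call passes currentDepth+1, so the MaxDepth check at the top of scrapeSingleURL eventually stops following links, and the visited set breaks cycles. A self-contained, synchronous sketch of that termination logic over a toy link graph (the commit launches each recursive call with go instead):

package main

import "fmt"

// links is a toy link graph standing in for real HTML extraction.
var links = map[string][]string{
	"/a": {"/b", "/c"},
	"/b": {"/a", "/c"},
	"/c": {"/d"},
}

// crawl mirrors the shape of scrapeSingleURL's termination logic:
// a depth guard, a visited check, and depth+1 on each recursion.
func crawl(url string, depth, maxDepth int, visited map[string]bool) {
	if maxDepth > 0 && depth > maxDepth {
		return // past the configured depth limit
	}
	if visited[url] {
		return // already reached via another link
	}
	visited[url] = true
	fmt.Printf("scrape %s at depth %d\n", url, depth)
	for _, next := range links[url] {
		crawl(next, depth+1, maxDepth, visited)
	}
}

func main() {
	crawl("/a", 0, 2, make(map[string]bool))
}

With MaxDepth set to 2, the seed /a is scraped at depth 0, /b at depth 1, and /c at depth 2; /d would be depth 3 and is cut off, while the /b -> /a back-link is skipped by the visited check.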