fix: update scrapeSingleURL calls to include visited map and currentDepth for thread safety and correct functionality

Arik Jones (aider)
2024-09-30 14:08:16 -05:00
parent ee1561c502
commit 54c3776baf


@@ -73,16 +73,20 @@ func ScrapeSites(config Config) (map[string]string, error) {
 	var wg sync.WaitGroup
 	totalURLs := 0
+	var mu sync.Mutex
 	for _, site := range config.Sites {
 		logger.Printf("Processing site: %s\n", site.BaseURL)
 		wg.Add(1)
 		go func(site SiteConfig) {
 			defer wg.Done()
+			visited := make(map[string]bool)
 			for _, path := range site.AllowedPaths {
 				fullURL := site.BaseURL + path
+				mu.Lock()
 				totalURLs++
+				mu.Unlock()
 				logger.Printf("Queueing URL for scraping: %s\n", fullURL)
-				scrapeSingleURL(fullURL, site, results, limiter)
+				scrapeSingleURL(fullURL, site, results, limiter, visited, 0)
 			}
 		}(site)
 	}
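Why the mutex: every site is processed in its own goroutine, so the shared totalURLs counter would otherwise be incremented concurrently, which is a data race in Go. A minimal standalone sketch of the pattern this hunk applies (illustrative names, not this repository's code):

package main

import (
	"fmt"
	"sync"
)

func main() {
	var (
		wg    sync.WaitGroup
		mu    sync.Mutex
		total int // shared across goroutines, guarded by mu
	)
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := 0; j < 1000; j++ {
				mu.Lock() // total++ alone is a read-modify-write race
				total++
				mu.Unlock()
			}
		}()
	}
	wg.Wait()
	fmt.Println(total) // always 4000 with the lock; unpredictable without it
}

Running the unlocked variant under go run -race reports the race; for a bare counter, an atomic.Int64 from sync/atomic would also work without a lock.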
@@ -113,12 +117,16 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
 	url     string
 	content string
 	err     error
-}, limiter *rate.Limiter, visited map[string]bool, currentDepth int,
-) {
+}, limiter *rate.Limiter, visited map[string]bool, currentDepth int) {
 	if site.MaxDepth > 0 && currentDepth > site.MaxDepth {
 		return
 	}
+	if visited[url] {
+		return
+	}
+	visited[url] = true
 	logger.Printf("Starting to scrape URL: %s\n", url)
 
 	// Wait for rate limiter before making the request
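The new guard at the top of scrapeSingleURL deduplicates work: a URL reachable through several links is scraped once, and marking visited[url] here replaces the per-link marking removed in the next hunk. One caveat: the recursive calls below are spawned with go while sharing this plain map, and Go maps are not safe for concurrent access; the check and the mark are also two separate steps. A hypothetical mutex-guarded set (an assumption, not part of this commit) would make both atomic:

package scraper

import "sync"

// visitedSet is a hypothetical concurrency-safe stand-in for the
// map[string]bool that this commit threads through the goroutines.
type visitedSet struct {
	mu   sync.Mutex
	seen map[string]bool
}

func newVisitedSet() *visitedSet {
	return &visitedSet{seen: make(map[string]bool)}
}

// Visit marks url as seen and reports whether it was seen before, so
// check-then-mark happens under one lock rather than as two racy steps.
func (v *visitedSet) Visit(url string) bool {
	v.mu.Lock()
	defer v.mu.Unlock()
	if v.seen[url] {
		return true
	}
	v.seen[url] = true
	return false
}

With such a type, the guard collapses to: if visited.Visit(url) { return }.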
@@ -164,7 +172,6 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
 	if exists {
 		resolvedURL := resolveURL(href, url)
 		if isAllowedURL(resolvedURL, site) && !visited[resolvedURL] {
-			visited[resolvedURL] = true
 			go scrapeSingleURL(resolvedURL, site, results, limiter, visited, currentDepth+1)
 		}
 	}
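Taken together, the hunks bound the crawl: each recursive call passes currentDepth+1, so the MaxDepth check at the top of scrapeSingleURL eventually stops following links, and the visited set breaks cycles. A self-contained, synchronous sketch of that termination logic over a toy link graph (the commit launches each recursive call with go instead):

package main

import "fmt"

// links is a toy link graph standing in for real HTML extraction.
var links = map[string][]string{
	"/a": {"/b", "/c"},
	"/b": {"/a", "/c"},
	"/c": {"/d"},
}

// crawl mirrors the shape of scrapeSingleURL's termination logic:
// a depth guard, a visited check, and depth+1 on each recursion.
func crawl(url string, depth, maxDepth int, visited map[string]bool) {
	if maxDepth > 0 && depth > maxDepth {
		return // past the configured depth limit
	}
	if visited[url] {
		return // already reached via another link
	}
	visited[url] = true
	fmt.Printf("scrape %s at depth %d\n", url, depth)
	for _, next := range links[url] {
		crawl(next, depth+1, maxDepth, visited)
	}
}

func main() {
	crawl("/a", 0, 2, make(map[string]bool))
}

With MaxDepth set to 2, the seed /a is scraped at depth 0, /b at depth 1, and /c at depth 2; /d would be depth 3 and is cut off, while the /b -> /a back-link is skipped by the visited check.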