mirror of
https://github.com/tnypxl/rollup.git
synced 2025-12-13 06:23:18 +00:00
fix: update scrapeSingleURL calls to include visited map and currentDepth for thread safety and correct functionality
This commit is contained in:
@@ -73,16 +73,20 @@ func ScrapeSites(config Config) (map[string]string, error) {
|
||||
|
||||
var wg sync.WaitGroup
|
||||
totalURLs := 0
|
||||
var mu sync.Mutex
|
||||
for _, site := range config.Sites {
|
||||
logger.Printf("Processing site: %s\n", site.BaseURL)
|
||||
wg.Add(1)
|
||||
go func(site SiteConfig) {
|
||||
defer wg.Done()
|
||||
visited := make(map[string]bool)
|
||||
for _, path := range site.AllowedPaths {
|
||||
fullURL := site.BaseURL + path
|
||||
mu.Lock()
|
||||
totalURLs++
|
||||
mu.Unlock()
|
||||
logger.Printf("Queueing URL for scraping: %s\n", fullURL)
|
||||
scrapeSingleURL(fullURL, site, results, limiter)
|
||||
scrapeSingleURL(fullURL, site, results, limiter, visited, 0)
|
||||
}
|
||||
}(site)
|
||||
}
|
||||
@@ -113,12 +117,16 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
|
||||
url string
|
||||
content string
|
||||
err error
|
||||
}, limiter *rate.Limiter, visited map[string]bool, currentDepth int,
|
||||
) {
|
||||
}, limiter *rate.Limiter, visited map[string]bool, currentDepth int) {
|
||||
if site.MaxDepth > 0 && currentDepth > site.MaxDepth {
|
||||
return
|
||||
}
|
||||
|
||||
if visited[url] {
|
||||
return
|
||||
}
|
||||
visited[url] = true
|
||||
|
||||
logger.Printf("Starting to scrape URL: %s\n", url)
|
||||
|
||||
// Wait for rate limiter before making the request
|
||||
@@ -164,7 +172,6 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
|
||||
if exists {
|
||||
resolvedURL := resolveURL(href, url)
|
||||
if isAllowedURL(resolvedURL, site) && !visited[resolvedURL] {
|
||||
visited[resolvedURL] = true
|
||||
go scrapeSingleURL(resolvedURL, site, results, limiter, visited, currentDepth+1)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user