Mirror of https://github.com/tnypxl/rollup.git
fix: update scrapeSingleURL calls to include visited map and currentDepth for thread safety and correct functionality
@@ -73,16 +73,20 @@ func ScrapeSites(config Config) (map[string]string, error) {
 	var wg sync.WaitGroup
 	totalURLs := 0
+	var mu sync.Mutex
 	for _, site := range config.Sites {
 		logger.Printf("Processing site: %s\n", site.BaseURL)
 		wg.Add(1)
 		go func(site SiteConfig) {
 			defer wg.Done()
+			visited := make(map[string]bool)
 			for _, path := range site.AllowedPaths {
 				fullURL := site.BaseURL + path
+				mu.Lock()
 				totalURLs++
+				mu.Unlock()
 				logger.Printf("Queueing URL for scraping: %s\n", fullURL)
-				scrapeSingleURL(fullURL, site, results, limiter)
+				scrapeSingleURL(fullURL, site, results, limiter, visited, 0)
 			}
 		}(site)
 	}
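In hunk 1, the new mu.Lock()/mu.Unlock() pair around totalURLs++ is the standard fix for a counter shared across goroutines: ++ is a read-modify-write, so unsynchronized increments can be lost. A minimal standalone sketch of the same pattern (names here are illustrative, not taken from rollup):

package main

import (
	"fmt"
	"sync"
)

func main() {
	var (
		wg    sync.WaitGroup
		mu    sync.Mutex
		total int // shared counter, guarded by mu
	)
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := 0; j < 1000; j++ {
				mu.Lock()
				total++ // only one goroutine increments at a time
				mu.Unlock()
			}
		}()
	}
	wg.Wait()
	fmt.Println(total) // always 10000; without the mutex the count is racy
}

For a plain counter, sync/atomic (atomic.AddInt64) would work just as well; the mutex simply matches the style this commit chose.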
@@ -113,12 +117,16 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
 	url     string
 	content string
 	err     error
-}, limiter *rate.Limiter, visited map[string]bool, currentDepth int,
-) {
+}, limiter *rate.Limiter, visited map[string]bool, currentDepth int) {
 	if site.MaxDepth > 0 && currentDepth > site.MaxDepth {
 		return
 	}
+
+	if visited[url] {
+		return
+	}
+	visited[url] = true
 
 	logger.Printf("Starting to scrape URL: %s\n", url)
 
 	// Wait for rate limiter before making the request
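Hunk 2 gives scrapeSingleURL the usual guarded-recursion shape: bail out past MaxDepth, skip URLs already seen, otherwise mark the URL and proceed. A self-contained sketch of that pattern over an in-memory link graph (the graph and function names are hypothetical, not rollup's API):

package main

import "fmt"

// links stands in for the pages a real scraper would fetch and parse.
var links = map[string][]string{
	"/docs":       {"/docs/intro", "/docs/api"},
	"/docs/intro": {"/docs"}, // cycle: the visited map prevents a loop
	"/docs/api":   {"/docs/api/v2"},
}

func crawl(url string, visited map[string]bool, depth, maxDepth int) {
	if maxDepth > 0 && depth > maxDepth {
		return // same depth cap as the diff
	}
	if visited[url] {
		return // already queued or scraped
	}
	visited[url] = true
	fmt.Println("scrape:", url)
	for _, next := range links[url] {
		crawl(next, visited, depth+1, maxDepth)
	}
}

func main() {
	crawl("/docs", make(map[string]bool), 0, 2)
}

Checking visited before recursing, and marking the URL before doing any work, is what makes the cycle in the example terminate.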
@@ -164,7 +172,6 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
 		if exists {
 			resolvedURL := resolveURL(href, url)
 			if isAllowedURL(resolvedURL, site) && !visited[resolvedURL] {
-				visited[resolvedURL] = true
 				go scrapeSingleURL(resolvedURL, site, results, limiter, visited, currentDepth+1)
 			}
 		}
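One caveat hunk 3 leaves open: scrapeSingleURL is still launched in new goroutines that read and write the same visited map, and plain Go maps are not safe for concurrent mutation. If those goroutines can truly run in parallel, the conventional guard is a mutex-protected set; a minimal sketch (the visitedSet type and its method are hypothetical, not part of rollup):

package main

import (
	"fmt"
	"sync"
)

// visitedSet wraps the visited map so concurrent scrapers can share it safely.
type visitedSet struct {
	mu   sync.Mutex
	seen map[string]bool
}

// MarkIfNew reports whether url was unseen, marking it inside the same
// critical section so two goroutines cannot both claim the same URL.
func (v *visitedSet) MarkIfNew(url string) bool {
	v.mu.Lock()
	defer v.mu.Unlock()
	if v.seen[url] {
		return false
	}
	v.seen[url] = true
	return true
}

func main() {
	v := &visitedSet{seen: make(map[string]bool)}
	fmt.Println(v.MarkIfNew("https://example.com")) // true: first visit
	fmt.Println(v.MarkIfNew("https://example.com")) // false: already seen
}

With such a type, the separate visited[url] check and assignment in scrapeSingleURL would collapse into a single if !visited.MarkIfNew(url) { return }.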