Mirror of https://github.com/tnypxl/rollup.git, synced 2025-12-15 23:13:22 +00:00
feat: enhance logging in scraper for better debugging
@@ -59,6 +59,7 @@ type PathOverride struct {
 }

 func ScrapeSites(config Config) (map[string]string, error) {
+    logger.Println("Starting ScrapeSites function")
     results := make(chan struct {
         url     string
         content string
@@ -66,10 +67,12 @@ func ScrapeSites(config Config) (map[string]string, error) {
     })

     limiter := rate.NewLimiter(rate.Limit(config.Scrape.RequestsPerSecond), config.Scrape.BurstLimit)
+    logger.Printf("Rate limiter configured with %f requests per second and burst limit of %d\n", config.Scrape.RequestsPerSecond, config.Scrape.BurstLimit)

     var wg sync.WaitGroup
     totalURLs := 0
     for _, site := range config.Sites {
+        logger.Printf("Processing site: %s\n", site.BaseURL)
         wg.Add(1)
         go func(site SiteConfig) {
             defer wg.Done()
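For reference, the limiter configured in the hunk above comes from golang.org/x/time/rate: NewLimiter takes a rate.Limit (events per second) and an integer burst size, and Wait blocks until the next event is allowed. A minimal standalone sketch of that pattern follows; the numeric values are placeholders, not rollup's defaults.

package main

import (
    "context"
    "fmt"
    "time"

    "golang.org/x/time/rate"
)

func main() {
    // 2 requests per second, bursts of up to 4 (placeholder values;
    // in rollup these come from config.Scrape).
    limiter := rate.NewLimiter(rate.Limit(2), 4)

    for i := 0; i < 6; i++ {
        // Wait blocks until the limiter allows another event or the
        // context is cancelled.
        if err := limiter.Wait(context.Background()); err != nil {
            fmt.Println("limiter error:", err)
            return
        }
        fmt.Printf("request %d at %s\n", i, time.Now().Format("15:04:05.000"))
    }
}

The first burst of requests passes immediately; after that they are spaced roughly half a second apart.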
@@ -85,6 +88,7 @@ func ScrapeSites(config Config) (map[string]string, error) {
     go func() {
         wg.Wait()
         close(results)
+        logger.Println("All goroutines completed, results channel closed")
     }()

     scrapedContent := make(map[string]string)
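The close(results) being logged here is the standard fan-in shutdown: workers send on a shared channel, a dedicated goroutine closes it once the WaitGroup drains, and the receiving loop ranges until the close. A self-contained sketch of that shape, assuming nothing beyond the standard library (URLs and content are stand-ins):

package main

import (
    "fmt"
    "sync"
)

func main() {
    results := make(chan struct {
        url     string
        content string
        err     error
    })

    urls := []string{"https://example.com/a", "https://example.com/b"}

    var wg sync.WaitGroup
    for _, u := range urls {
        wg.Add(1)
        go func(u string) {
            defer wg.Done()
            // A real worker would scrape here; this one just echoes.
            results <- struct {
                url     string
                content string
                err     error
            }{url: u, content: "stub body"}
        }(u)
    }

    // Close only after every worker is done, so the range loop
    // below terminates instead of deadlocking.
    go func() {
        wg.Wait()
        close(results)
    }()

    scraped := make(map[string]string)
    for r := range results {
        if r.err != nil {
            continue
        }
        scraped[r.url] = r.content
    }
    fmt.Println(len(scraped), "pages collected")
}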
@@ -93,6 +97,7 @@ func ScrapeSites(config Config) (map[string]string, error) {
             logger.Printf("Error scraping %s: %v\n", result.url, result.err)
             continue
         }
+        logger.Printf("Successfully scraped content from %s (length: %d)\n", result.url, len(result.content))
         scrapedContent[result.url] = result.content
     }

@@ -107,7 +112,7 @@ func scrapeSingleURL(url string, site SiteConfig, config Config, results chan<-
     content string
     err     error
 }, limiter *rate.Limiter) {
-    logger.Printf("Scraping URL: %s\n", url)
+    logger.Printf("Starting to scrape URL: %s\n", url)

     // Wait for rate limiter before making the request
     err := limiter.Wait(context.Background())
@@ -122,8 +127,8 @@ func scrapeSingleURL(url string, site SiteConfig, config Config, results chan<-
     }

     cssLocator, excludeSelectors := getOverrides(url, site)
-    logger.Printf("Using CSS locator: %s\n", cssLocator)
-    logger.Printf("Exclude selectors: %v\n", excludeSelectors)
+    logger.Printf("Using CSS locator for %s: %s\n", url, cssLocator)
+    logger.Printf("Exclude selectors for %s: %v\n", url, excludeSelectors)

     content, err := scrapeURL(url, cssLocator, excludeSelectors)
     if err != nil {
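The diff calls getOverrides but never shows it; judging from the PathOverride struct named in the first hunk header, it resolves a per-path CSS locator and exclude-selector list, falling back to site-wide settings. A hypothetical sketch of that lookup follows; every field name here is an assumption, not rollup's actual definition.

package main

import (
    "fmt"
    "strings"
)

// Assumed shapes, inferred from the diff; the real rollup structs
// may differ.
type PathOverride struct {
    Path             string
    CSSLocator       string
    ExcludeSelectors []string
}

type SiteConfig struct {
    BaseURL          string
    CSSLocator       string
    ExcludeSelectors []string
    PathOverrides    []PathOverride
}

// Return the override whose path prefix matches the URL, otherwise
// the site-wide defaults.
func getOverrides(url string, site SiteConfig) (string, []string) {
    path := strings.TrimPrefix(url, site.BaseURL)
    for _, o := range site.PathOverrides {
        if strings.HasPrefix(path, o.Path) {
            return o.CSSLocator, o.ExcludeSelectors
        }
    }
    return site.CSSLocator, site.ExcludeSelectors
}

func main() {
    site := SiteConfig{
        BaseURL:    "https://example.com",
        CSSLocator: "main",
        PathOverrides: []PathOverride{
            {Path: "/docs", CSSLocator: "article", ExcludeSelectors: []string{".sidebar"}},
        },
    }
    loc, excl := getOverrides("https://example.com/docs/intro", site)
    fmt.Println(loc, excl)
}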
@@ -136,7 +141,12 @@ func scrapeSingleURL(url string, site SiteConfig, config Config, results chan<-
         return
     }

-    logger.Printf("Successfully scraped content from %s (length: %d)\n", url, len(content))
+    if content == "" {
+        logger.Printf("Warning: Empty content scraped from %s\n", url)
+    } else {
+        logger.Printf("Successfully scraped content from %s (length: %d)\n", url, len(content))
+    }

     results <- struct {
         url     string
         content string
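Every added line writes through a package-level logger that the diff itself doesn't define. One conventional way to set such a logger up with only the standard library (the prefix and destination here are assumptions):

package main

import (
    "log"
    "os"
)

// Package-level logger like the one the diff calls. Writing to
// stderr keeps debug output separate from anything the tool prints
// to stdout.
var logger = log.New(os.Stderr, "rollup: ", log.LstdFlags|log.Lmicroseconds)

func main() {
    logger.Println("Starting ScrapeSites function")
    logger.Printf("Processing site: %s\n", "https://example.com")
}

Note that the standard logger appends a newline whenever the message lacks one, so the explicit \n in the diff's format strings is harmless but redundant.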