mirror of
https://github.com/tnypxl/rollup.git
synced 2025-12-15 15:03:17 +00:00
feat: implement rate limiting for URL scraping
This commit is contained in:
@@ -15,8 +15,10 @@ type Config struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type ScrapeConfig struct {
|
type ScrapeConfig struct {
|
||||||
URLs []URLConfig `yaml:"urls"`
|
URLs []URLConfig `yaml:"urls"`
|
||||||
OutputType string `yaml:"output_type"`
|
OutputType string `yaml:"output_type"`
|
||||||
|
RequestsPerSecond float64 `yaml:"requests_per_second"`
|
||||||
|
BurstLimit int `yaml:"burst_limit"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type URLConfig struct {
|
type URLConfig struct {
|
||||||
|
|||||||
@@ -8,10 +8,13 @@ import (
|
|||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
"sync"
|
||||||
|
"context"
|
||||||
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
"github.com/playwright-community/playwright-go"
|
"github.com/playwright-community/playwright-go"
|
||||||
md "github.com/JohannesKaufmann/html-to-markdown"
|
md "github.com/JohannesKaufmann/html-to-markdown"
|
||||||
|
"golang.org/x/time/rate"
|
||||||
)
|
)
|
||||||
|
|
||||||
var logger *log.Logger
|
var logger *log.Logger
|
||||||
@@ -36,8 +39,37 @@ func ScrapeMultipleURLs(config Config) (map[string]string, error) {
|
|||||||
err error
|
err error
|
||||||
}, len(config.URLs))
|
}, len(config.URLs))
|
||||||
|
|
||||||
|
// Use default values if not specified in the config
|
||||||
|
requestsPerSecond := 0.5 // Default to 1 request every 2 seconds
|
||||||
|
if config.Scrape.RequestsPerSecond > 0 {
|
||||||
|
requestsPerSecond = config.Scrape.RequestsPerSecond
|
||||||
|
}
|
||||||
|
|
||||||
|
burstLimit := 1 // Default to 1
|
||||||
|
if config.Scrape.BurstLimit > 0 {
|
||||||
|
burstLimit = config.Scrape.BurstLimit
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a rate limiter based on the configuration
|
||||||
|
limiter := rate.NewLimiter(rate.Limit(requestsPerSecond), burstLimit)
|
||||||
|
|
||||||
|
var wg sync.WaitGroup
|
||||||
for _, urlConfig := range config.URLs {
|
for _, urlConfig := range config.URLs {
|
||||||
|
wg.Add(1)
|
||||||
go func(cfg URLConfig) {
|
go func(cfg URLConfig) {
|
||||||
|
defer wg.Done()
|
||||||
|
|
||||||
|
// Wait for rate limiter before making the request
|
||||||
|
err := limiter.Wait(context.Background())
|
||||||
|
if err != nil {
|
||||||
|
results <- struct {
|
||||||
|
url string
|
||||||
|
content string
|
||||||
|
err error
|
||||||
|
}{cfg.URL, "", fmt.Errorf("rate limiter error: %v", err)}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
content, err := scrapeURL(cfg)
|
content, err := scrapeURL(cfg)
|
||||||
results <- struct {
|
results <- struct {
|
||||||
url string
|
url string
|
||||||
@@ -47,9 +79,13 @@ func ScrapeMultipleURLs(config Config) (map[string]string, error) {
|
|||||||
}(urlConfig)
|
}(urlConfig)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
wg.Wait()
|
||||||
|
close(results)
|
||||||
|
}()
|
||||||
|
|
||||||
scrapedContent := make(map[string]string)
|
scrapedContent := make(map[string]string)
|
||||||
for i := 0; i < len(config.URLs); i++ {
|
for result := range results {
|
||||||
result := <-results
|
|
||||||
if result.err != nil {
|
if result.err != nil {
|
||||||
logger.Printf("Error scraping %s: %v\n", result.url, result.err)
|
logger.Printf("Error scraping %s: %v\n", result.url, result.err)
|
||||||
continue
|
continue
|
||||||
|
|||||||
Reference in New Issue
Block a user