From 569ff9924dfee295ee415f9c1c417951ee3d1522 Mon Sep 17 00:00:00 2001
From: "Arik Jones (aider)"
Date: Thu, 19 Sep 2024 16:06:55 -0500
Subject: [PATCH] feat: implement site-based scraping with path overrides

---
 cmd/web.go                  |  74 +++++++------
 internal/config/config.go   |  28 +++--
 internal/scraper/scraper.go | 203 +++++++++++++++++++++++-------------
 3 files changed, 195 insertions(+), 110 deletions(-)

diff --git a/cmd/web.go b/cmd/web.go
index 935f704..94aa3e3 100644
--- a/cmd/web.go
+++ b/cmd/web.go
@@ -38,41 +38,53 @@ func init() {
 }
 
 func runWeb(cmd *cobra.Command, args []string) error {
-	scraperConfig.Verbose = verbose
+	scraperConfig.Verbose = verbose
 
-	// Use config if available, otherwise use command-line flags
-	var urlConfigs []scraper.URLConfig
-	if len(urls) == 0 && len(cfg.Scrape.URLs) > 0 {
-		urlConfigs = make([]scraper.URLConfig, len(cfg.Scrape.URLs))
-		for i, u := range cfg.Scrape.URLs {
-			urlConfigs[i] = scraper.URLConfig{
-				URL:              u.URL,
-				CSSLocator:       u.CSSLocator,
-				ExcludeSelectors: u.ExcludeSelectors,
-				OutputAlias:      u.OutputAlias,
-			}
-		}
-	} else {
-		urlConfigs = make([]scraper.URLConfig, len(urls))
-		for i, u := range urls {
-			urlConfigs[i] = scraper.URLConfig{URL: u, CSSLocator: includeSelector}
-		}
-	}
+	var siteConfigs []scraper.SiteConfig
+	if len(cfg.Scrape.Sites) > 0 {
+		siteConfigs = make([]scraper.SiteConfig, len(cfg.Scrape.Sites))
+		for i, site := range cfg.Scrape.Sites {
+			siteConfigs[i] = scraper.SiteConfig{
+				BaseURL:          site.BaseURL,
+				CSSLocator:       site.CSSLocator,
+				ExcludeSelectors: site.ExcludeSelectors,
+				MaxDepth:         site.MaxDepth,
+				AllowedPaths:     site.AllowedPaths,
+				ExcludePaths:     site.ExcludePaths,
+				OutputAlias:      site.OutputAlias,
+				PathOverrides:    site.PathOverrides,
+			}
+		}
+	} else {
+		// Fallback to URL-based configuration if no sites are defined
+		siteConfigs = make([]scraper.SiteConfig, len(urls))
+		for i, u := range urls {
+			siteConfigs[i] = scraper.SiteConfig{
+				BaseURL:          u,
+				CSSLocator:       includeSelector,
+				ExcludeSelectors: excludeSelectors,
+			}
+		}
+	}
 
-	if len(urlConfigs) == 0 {
-		return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.urls' in the rollup.yml file")
-	}
+	if len(siteConfigs) == 0 {
+		return fmt.Errorf("no sites or URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.sites' in the rollup.yml file")
+	}
 
-	scraperConfig := scraper.Config{
-		URLs:       urlConfigs,
-		OutputType: outputType,
-		Verbose:    verbose,
-	}
+	scraperConfig := scraper.Config{
+		Sites:      siteConfigs,
+		OutputType: cfg.Scrape.OutputType,
+		Verbose:    verbose,
+		Scrape: scraper.ScrapeConfig{
+			RequestsPerSecond: cfg.Scrape.RequestsPerSecond,
+			BurstLimit:        cfg.Scrape.BurstLimit,
+		},
+	}
 
-	scrapedContent, err := scraper.ScrapeMultipleURLs(scraperConfig)
-	if err != nil {
-		return fmt.Errorf("error scraping content: %v", err)
-	}
+	scrapedContent, err := scraper.ScrapeSites(scraperConfig)
+	if err != nil {
+		return fmt.Errorf("error scraping content: %v", err)
+	}
 
 	if outputType == "single" {
 		return writeSingleFile(scrapedContent)
diff --git a/internal/config/config.go b/internal/config/config.go
index cdd1a88..0042396 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -15,17 +15,27 @@ type Config struct {
 }
 
 type ScrapeConfig struct {
-	URLs              []URLConfig `yaml:"urls"`
-	OutputType        string      `yaml:"output_type"`
-	RequestsPerSecond float64     `yaml:"requests_per_second"`
-	BurstLimit        int         `yaml:"burst_limit"`
+	Sites             []SiteConfig `yaml:"sites"`
+	OutputType        string       `yaml:"output_type"`
+	RequestsPerSecond float64      `yaml:"requests_per_second"`
+	BurstLimit        int          `yaml:"burst_limit"`
 }
 
-type URLConfig struct {
-	URL              string   `yaml:"url"`
-	CSSLocator       string   `yaml:"css_locator"`
-	ExcludeSelectors []string `yaml:"exclude_selectors"`
-	OutputAlias      string   `yaml:"output_alias"`
+type SiteConfig struct {
+	BaseURL          string         `yaml:"base_url"`
+	CSSLocator       string         `yaml:"css_locator"`
+	ExcludeSelectors []string       `yaml:"exclude_selectors"`
+	MaxDepth         int            `yaml:"max_depth"`
+	AllowedPaths     []string       `yaml:"allowed_paths"`
+	ExcludePaths     []string       `yaml:"exclude_paths"`
+	OutputAlias      string         `yaml:"output_alias"`
+	PathOverrides    []PathOverride `yaml:"path_overrides"`
+}
+
+type PathOverride struct {
+	Path             string   `yaml:"path"`
+	CSSLocator       string   `yaml:"css_locator"`
+	ExcludeSelectors []string `yaml:"exclude_selectors"`
 }
 
 func Load(configPath string) (*Config, error) {
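For reference, below is a sketch of what the scrape section of a rollup.yml could look like against the new schema. The keys mirror the yaml tags on ScrapeConfig, SiteConfig, and PathOverride above; the site URL, selectors, path prefixes, and numeric limits are invented purely for illustration.

    scrape:
      output_type: single
      requests_per_second: 0.5
      burst_limit: 1
      sites:
        - base_url: https://example.com/docs    # hypothetical site
          css_locator: "article.content"        # default selector for every page on the site
          exclude_selectors:
            - "nav"
            - ".sidebar"
          max_depth: 25                         # caps the total number of pages visited
          allowed_paths:
            - /docs
          exclude_paths:
            - /docs/changelog
          output_alias: example-docs
          path_overrides:
            - path: /docs/api                   # pages under /docs/api get their own selectors
              css_locator: "div.api-reference"
              exclude_selectors:
                - ".deprecated"

Paths are matched by prefix, and an override that omits css_locator keeps the site-level locator while still applying its own exclude_selectors.
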
diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go
index bf627a6..cba9a7d 100644
--- a/internal/scraper/scraper.go
+++ b/internal/scraper/scraper.go
@@ -38,85 +38,148 @@ type ScrapeConfig struct {
 	BurstLimit        int
 }
 
-// ScrapeMultipleURLs scrapes multiple URLs concurrently
-func ScrapeMultipleURLs(config Config) (map[string]string, error) {
-	results := make(chan struct {
-		url     string
-		content string
-		err     error
-	}, len(config.URLs))
-
-	// Use default values if not specified in the config
-	requestsPerSecond := 0.5 // Default to 1 request every 2 seconds
-	if config.Scrape.RequestsPerSecond > 0 {
-		requestsPerSecond = config.Scrape.RequestsPerSecond
-	}
-
-	burstLimit := 1 // Default to 1
-	if config.Scrape.BurstLimit > 0 {
-		burstLimit = config.Scrape.BurstLimit
-	}
-
-	// Create a rate limiter based on the configuration
-	limiter := rate.NewLimiter(rate.Limit(requestsPerSecond), burstLimit)
-
-	var wg sync.WaitGroup
-	for _, urlConfig := range config.URLs {
-		wg.Add(1)
-		go func(cfg URLConfig) {
-			defer wg.Done()
-
-			// Wait for rate limiter before making the request
-			err := limiter.Wait(context.Background())
-			if err != nil {
-				results <- struct {
-					url     string
-					content string
-					err     error
-				}{cfg.URL, "", fmt.Errorf("rate limiter error: %v", err)}
-				return
-			}
-
-			content, err := scrapeURL(cfg)
-			results <- struct {
-				url     string
-				content string
-				err     error
-			}{cfg.URL, content, err}
-		}(urlConfig)
-	}
-
-	go func() {
-		wg.Wait()
-		close(results)
-	}()
-
-	scrapedContent := make(map[string]string)
-	for result := range results {
-		if result.err != nil {
-			logger.Printf("Error scraping %s: %v\n", result.url, result.err)
-			continue
-		}
-		scrapedContent[result.url] = result.content
-	}
-
-	return scrapedContent, nil
+// ScrapeSites crawls every configured site concurrently and returns the scraped content keyed by URL.
+func ScrapeSites(config Config) (map[string]string, error) {
+	results := make(chan struct {
+		url     string
+		content string
+		err     error
+	})
+
+	limiter := rate.NewLimiter(rate.Limit(config.Scrape.RequestsPerSecond), config.Scrape.BurstLimit)
+
+	var wg sync.WaitGroup
+	for _, site := range config.Sites {
+		wg.Add(1)
+		go func(site SiteConfig) {
+			defer wg.Done()
+			scrapeSite(site, config, results, limiter)
+		}(site)
+	}
+
+	// Close the results channel once every site crawler has finished.
+	go func() {
+		wg.Wait()
+		close(results)
+	}()
+
+	scrapedContent := make(map[string]string)
+	for result := range results {
+		if result.err != nil {
+			logger.Printf("Error scraping %s: %v\n", result.url, result.err)
+			continue
+		}
+		scrapedContent[result.url] = result.content
+	}
+
+	return scrapedContent, nil
 }
 
-func scrapeURL(config URLConfig) (string, error) {
-	content, err := FetchWebpageContent(config.URL)
-	if err != nil {
-		return "", err
-	}
-
-	if config.CSSLocator != "" {
-		content, err = ExtractContentWithCSS(content, config.CSSLocator, config.ExcludeSelectors)
-		if err != nil {
-			return "", err
-		}
-	}
-
-	return ProcessHTMLContent(content, Config{})
+func scrapeSite(site SiteConfig, config Config, results chan<- struct {
+	url     string
+	content string
+	err     error
+}, limiter *rate.Limiter) {
+	// Breadth-first crawl starting from the site's base URL.
+	visited := make(map[string]bool)
+	queue := []string{site.BaseURL}
+
+	for len(queue) > 0 {
+		url := queue[0]
+		queue = queue[1:]
+
+		if visited[url] {
+			continue
+		}
+		visited[url] = true
+
+		if !isAllowedURL(url, site) {
+			continue
+		}
+
+		// Wait for the rate limiter before making the request
+		err := limiter.Wait(context.Background())
+		if err != nil {
+			results <- struct {
+				url     string
+				content string
+				err     error
+			}{url, "", fmt.Errorf("rate limiter error: %v", err)}
+			continue
+		}
+
+		cssLocator, excludeSelectors := getOverrides(url, site)
+		content, err := scrapeURL(url, cssLocator, excludeSelectors)
+		results <- struct {
+			url     string
+			content string
+			err     error
+		}{url, content, err}
+
+		// MaxDepth caps the total number of pages visited per site; only
+		// enqueue more links while under that cap.
+		if len(visited) < site.MaxDepth {
+			links, _ := ExtractLinks(url)
+			for _, link := range links {
+				if !visited[link] && isAllowedURL(link, site) {
+					queue = append(queue, link)
+				}
+			}
+		}
+	}
+}
+
+func isAllowedURL(rawURL string, site SiteConfig) bool {
+	parsedURL, err := url.Parse(rawURL)
+	if err != nil {
+		return false
+	}
+
+	baseURL, err := url.Parse(site.BaseURL)
+	if err != nil || parsedURL.Host != baseURL.Host {
+		return false
+	}
+
+	path := parsedURL.Path
+	for _, excludePath := range site.ExcludePaths {
+		if strings.HasPrefix(path, excludePath) {
+			return false
+		}
+	}
+
+	// With no allowed_paths configured, every non-excluded path on the host is allowed.
+	if len(site.AllowedPaths) == 0 {
+		return true
+	}
+	for _, allowedPath := range site.AllowedPaths {
+		if strings.HasPrefix(path, allowedPath) {
+			return true
+		}
+	}
+
+	return false
+}
+
+func getOverrides(rawURL string, site SiteConfig) (string, []string) {
+	parsedURL, err := url.Parse(rawURL)
+	if err != nil {
+		return site.CSSLocator, site.ExcludeSelectors
+	}
+	path := parsedURL.Path
+
+	// The first override whose path prefix matches wins; an override without
+	// its own css_locator keeps the site-level locator.
+	for _, override := range site.PathOverrides {
+		if strings.HasPrefix(path, override.Path) {
+			if override.CSSLocator != "" {
+				return override.CSSLocator, override.ExcludeSelectors
+			}
+			return site.CSSLocator, override.ExcludeSelectors
+		}
+	}
+
+	return site.CSSLocator, site.ExcludeSelectors
+}
+
+func scrapeURL(url, cssLocator string, excludeSelectors []string) (string, error) {
+	content, err := FetchWebpageContent(url)
+	if err != nil {
+		return "", err
+	}
+
+	if cssLocator != "" {
+		content, err = ExtractContentWithCSS(content, cssLocator, excludeSelectors)
+		if err != nil {
+			return "", err
+		}
+	}
+
+	return ProcessHTMLContent(content, Config{})
 }
 
 func getFilenameFromContent(content, url string) string {