# Patch for internal/scraper/scraper.go.
#
# What the hunks below change:
#   - Imports "net/url".
#   - Config: replaces the field `URLs []URLConfig` with `Sites []SiteConfig`.
#   - Adds the SiteConfig and PathOverride struct types.
#   - isAllowedURL, getOverrides: renames the string parameter from `url` to
#     `urlStr`, so the `url.Parse` call no longer resolves `url` to the
#     parameter itself.
#
# NOTE(review): getOverrides still discards the error from url.Parse and then
# reads parsedURL.Path; presumably this can dereference nil for an unparsable
# URL. Confirm, and handle the error the way isAllowedURL does.
# NOTE(review): line breaks, indentation and blank context lines were restored
# from the hunk line counts; verify whitespace against the target file before
# applying.
diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go
index cba9a7d..0a96dd6 100644
--- a/internal/scraper/scraper.go
+++ b/internal/scraper/scraper.go
@@ -5,6 +5,7 @@ import (
 	"io/ioutil"
 	"log"
 	"math/rand"
+	"net/url"
 	"regexp"
 	"strings"
 	"time"
@@ -26,7 +27,7 @@ var (
 
 // Config holds the scraper configuration
 type Config struct {
-	URLs       []URLConfig
+	Sites      []SiteConfig
 	OutputType string
 	Verbose    bool
 	Scrape     ScrapeConfig
@@ -38,6 +39,25 @@ type ScrapeConfig struct {
 	BurstLimit int
 }
 
+// SiteConfig holds configuration for a single site
+type SiteConfig struct {
+	BaseURL          string
+	CSSLocator       string
+	ExcludeSelectors []string
+	MaxDepth         int
+	AllowedPaths     []string
+	ExcludePaths     []string
+	OutputAlias      string
+	PathOverrides    []PathOverride
+}
+
+// PathOverride holds path-specific overrides
+type PathOverride struct {
+	Path             string
+	CSSLocator       string
+	ExcludeSelectors []string
+}
+
 func ScrapeSites(config Config) (map[string]string, error) {
 	results := make(chan struct {
 		url string
@@ -124,8 +144,8 @@ func scrapeSite(site SiteConfig, config Config, results chan<- struct {
 	}
 }
 
-func isAllowedURL(url string, site SiteConfig) bool {
-	parsedURL, err := url.Parse(url)
+func isAllowedURL(urlStr string, site SiteConfig) bool {
+	parsedURL, err := url.Parse(urlStr)
 	if err != nil {
 		return false
 	}
@@ -150,8 +170,8 @@ func isAllowedURL(url string, site SiteConfig) bool {
 	return false
 }
 
-func getOverrides(url string, site SiteConfig) (string, []string) {
-	parsedURL, _ := url.Parse(url)
+func getOverrides(urlStr string, site SiteConfig) (string, []string) {
+	parsedURL, _ := url.Parse(urlStr)
 	path := parsedURL.Path
 
 	for _, override := range site.PathOverrides {