fix: resolve undefined types and import issues in scraper.go

This commit is contained in:
Arik Jones (aider)
2024-09-19 16:10:06 -05:00
parent 569ff9924d
commit e3fddf101c

View File

@@ -5,6 +5,7 @@ import (
"io/ioutil"
"log"
"math/rand"
"net/url"
"regexp"
"strings"
"time"
@@ -26,7 +27,7 @@ var (
// Config holds the scraper configuration
type Config struct {
URLs []URLConfig
Sites []SiteConfig
OutputType string
Verbose bool
Scrape ScrapeConfig
@@ -38,6 +39,25 @@ type ScrapeConfig struct {
BurstLimit int
}
// SiteConfig holds configuration for a single site.
//
// It is the element type of Config.Sites and is the value handed to
// scrapeSite, isAllowedURL and getOverrides.
type SiteConfig struct {
// BaseURL is presumably the root URL the scrape of this site starts
// from — TODO confirm against scrapeSite.
BaseURL string
// CSSLocator is presumably the site-wide CSS selector used to pick the
// content to extract; PathOverride carries a field of the same name for
// path-specific values — TODO confirm.
CSSLocator string
// ExcludeSelectors is presumably a list of CSS selectors whose matches
// are dropped from the extracted content — TODO confirm.
ExcludeSelectors []string
// MaxDepth presumably bounds how many links deep the crawl follows from
// BaseURL; the meaning of zero is not visible here — TODO confirm.
MaxDepth int
// AllowedPaths and ExcludePaths are presumably the path filters
// consulted by isAllowedURL; the matching rules are not visible in this
// part of the file — TODO confirm.
AllowedPaths []string
ExcludePaths []string
// OutputAlias is presumably an alternate name used for this site's
// output — TODO confirm where it is consumed.
OutputAlias string
// PathOverrides lists the per-path overrides that getOverrides iterates
// over for a given URL.
PathOverrides []PathOverride
}
// PathOverride holds path-specific overrides.
//
// Values of this type are stored in SiteConfig.PathOverrides, which
// getOverrides ranges over after parsing the URL's path.
type PathOverride struct {
// Path identifies the URL path this override applies to. Whether it is
// matched exactly or as a prefix is not visible here — TODO confirm in
// getOverrides.
Path string
// CSSLocator is presumably the selector used instead of
// SiteConfig.CSSLocator for matching paths — TODO confirm.
CSSLocator string
// ExcludeSelectors is presumably the exclusion list used instead of
// SiteConfig.ExcludeSelectors for matching paths — TODO confirm.
ExcludeSelectors []string
}
func ScrapeSites(config Config) (map[string]string, error) {
results := make(chan struct {
url string
@@ -124,8 +144,8 @@ func scrapeSite(site SiteConfig, config Config, results chan<- struct {
}
}
func isAllowedURL(url string, site SiteConfig) bool {
parsedURL, err := url.Parse(url)
func isAllowedURL(urlStr string, site SiteConfig) bool {
parsedURL, err := url.Parse(urlStr)
if err != nil {
return false
}
@@ -150,8 +170,8 @@ func isAllowedURL(url string, site SiteConfig) bool {
return false
}
func getOverrides(url string, site SiteConfig) (string, []string) {
parsedURL, _ := url.Parse(url)
func getOverrides(urlStr string, site SiteConfig) (string, []string) {
parsedURL, _ := url.Parse(urlStr)
path := parsedURL.Path
for _, override := range site.PathOverrides {