Mirror of https://github.com/tnypxl/rollup.git (last synced 2025-12-15 15:03:17 +00:00).
fix: resolve undefined types and import issues in scraper.go
This commit is contained in:
@@ -5,6 +5,7 @@ import (
|
|||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"log"
|
"log"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
|
"net/url"
|
||||||
"regexp"
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
@@ -26,7 +27,7 @@ var (
|
|||||||
|
|
||||||
// Config holds the scraper configuration
|
// Config holds the scraper configuration
|
||||||
type Config struct {
|
type Config struct {
|
||||||
URLs []URLConfig
|
Sites []SiteConfig
|
||||||
OutputType string
|
OutputType string
|
||||||
Verbose bool
|
Verbose bool
|
||||||
Scrape ScrapeConfig
|
Scrape ScrapeConfig
|
||||||
@@ -38,6 +39,25 @@ type ScrapeConfig struct {
|
|||||||
BurstLimit int
|
BurstLimit int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SiteConfig holds configuration for a single site
|
||||||
|
type SiteConfig struct {
|
||||||
|
BaseURL string
|
||||||
|
CSSLocator string
|
||||||
|
ExcludeSelectors []string
|
||||||
|
MaxDepth int
|
||||||
|
AllowedPaths []string
|
||||||
|
ExcludePaths []string
|
||||||
|
OutputAlias string
|
||||||
|
PathOverrides []PathOverride
|
||||||
|
}
|
||||||
|
|
||||||
|
// PathOverride holds path-specific overrides
|
||||||
|
type PathOverride struct {
|
||||||
|
Path string
|
||||||
|
CSSLocator string
|
||||||
|
ExcludeSelectors []string
|
||||||
|
}
|
||||||
|
|
||||||
func ScrapeSites(config Config) (map[string]string, error) {
|
func ScrapeSites(config Config) (map[string]string, error) {
|
||||||
results := make(chan struct {
|
results := make(chan struct {
|
||||||
url string
|
url string
|
||||||
@@ -124,8 +144,8 @@ func scrapeSite(site SiteConfig, config Config, results chan<- struct {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func isAllowedURL(url string, site SiteConfig) bool {
|
func isAllowedURL(urlStr string, site SiteConfig) bool {
|
||||||
parsedURL, err := url.Parse(url)
|
parsedURL, err := url.Parse(urlStr)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
@@ -150,8 +170,8 @@ func isAllowedURL(url string, site SiteConfig) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func getOverrides(url string, site SiteConfig) (string, []string) {
|
func getOverrides(urlStr string, site SiteConfig) (string, []string) {
|
||||||
parsedURL, _ := url.Parse(url)
|
parsedURL, _ := url.Parse(urlStr)
|
||||||
path := parsedURL.Path
|
path := parsedURL.Path
|
||||||
|
|
||||||
for _, override := range site.PathOverrides {
|
for _, override := range site.PathOverrides {
|
||||||
|
|||||||
Reference in New Issue
Block a user