mirror of
https://github.com/tnypxl/rollup.git
synced 2025-12-15 23:13:22 +00:00
flatten scrape config to 'sites:'
* Flatten scrape config to 'sites:'. Update unit tests and README.
* Remove check for file_extensions configuration.
* Show progress indication after 5 seconds.
* Add documentation to functions.
* fix: Remove MaxDepth and link extraction functionality.
* fix: Remove MaxDepth references from cmd/web.go.
This commit is contained in:
@@ -7,35 +7,64 @@ import (
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
// Config represents the configuration for the rollup tool
|
||||
type Config struct {
|
||||
FileTypes []string `yaml:"file_types"`
|
||||
Ignore []string `yaml:"ignore"`
|
||||
CodeGenerated []string `yaml:"code_generated"`
|
||||
Scrape ScrapeConfig `yaml:"scrape"`
|
||||
}
|
||||
|
||||
type ScrapeConfig struct {
|
||||
Sites []SiteConfig `yaml:"sites"`
|
||||
OutputType string `yaml:"output_type"`
|
||||
RequestsPerSecond float64 `yaml:"requests_per_second"`
|
||||
BurstLimit int `yaml:"burst_limit"`
|
||||
// FileExtensions is a list of file extensions to include in the rollup
|
||||
FileExtensions []string `yaml:"file_extensions"`
|
||||
|
||||
// IgnorePaths is a list of glob patterns for paths to ignore
|
||||
IgnorePaths []string `yaml:"ignore_paths"`
|
||||
|
||||
// CodeGeneratedPaths is a list of glob patterns for code-generated files
|
||||
CodeGeneratedPaths []string `yaml:"code_generated_paths"`
|
||||
|
||||
// Sites is a list of site configurations for web scraping
|
||||
Sites []SiteConfig `yaml:"sites"`
|
||||
|
||||
// OutputType specifies how the output should be generated
|
||||
OutputType string `yaml:"output_type"`
|
||||
|
||||
// RequestsPerSecond limits the rate of web requests
|
||||
RequestsPerSecond *float64 `yaml:"requests_per_second,omitempty"`
|
||||
|
||||
// BurstLimit sets the maximum burst size for rate limiting
|
||||
BurstLimit *int `yaml:"burst_limit,omitempty"`
|
||||
}
|
||||
|
||||
// SiteConfig contains configuration for scraping a single site
|
||||
type SiteConfig struct {
|
||||
BaseURL string `yaml:"base_url"`
|
||||
CSSLocator string `yaml:"css_locator"`
|
||||
ExcludeSelectors []string `yaml:"exclude_selectors"`
|
||||
MaxDepth int `yaml:"max_depth"`
|
||||
AllowedPaths []string `yaml:"allowed_paths"`
|
||||
ExcludePaths []string `yaml:"exclude_paths"`
|
||||
OutputAlias string `yaml:"output_alias"`
|
||||
PathOverrides []PathOverride `yaml:"path_overrides"`
|
||||
// BaseURL is the starting point for scraping this site
|
||||
BaseURL string `yaml:"base_url"`
|
||||
|
||||
// CSSLocator is used to extract specific content
|
||||
CSSLocator string `yaml:"css_locator"`
|
||||
|
||||
// ExcludeSelectors lists CSS selectors for content to exclude
|
||||
ExcludeSelectors []string `yaml:"exclude_selectors"`
|
||||
|
||||
// AllowedPaths lists paths that are allowed to be scraped
|
||||
AllowedPaths []string `yaml:"allowed_paths"`
|
||||
|
||||
// ExcludePaths lists paths that should not be scraped
|
||||
ExcludePaths []string `yaml:"exclude_paths"`
|
||||
|
||||
// OutputAlias provides an alternative name for output files
|
||||
OutputAlias string `yaml:"output_alias"`
|
||||
|
||||
// PathOverrides allows for path-specific configurations
|
||||
PathOverrides []PathOverride `yaml:"path_overrides"`
|
||||
}
|
||||
|
||||
// PathOverride allows for path-specific configurations
|
||||
type PathOverride struct {
|
||||
Path string `yaml:"path"`
|
||||
CSSLocator string `yaml:"css_locator"`
|
||||
ExcludeSelectors []string `yaml:"exclude_selectors"`
|
||||
// Path is the URL path this override applies to
|
||||
Path string `yaml:"path"`
|
||||
|
||||
// CSSLocator overrides the site-wide CSS locator for this path
|
||||
CSSLocator string `yaml:"css_locator"`
|
||||
|
||||
// ExcludeSelectors overrides the site-wide exclude selectors for this path
|
||||
ExcludeSelectors []string `yaml:"exclude_selectors"`
|
||||
}
|
||||
|
||||
func Load(configPath string) (*Config, error) {
|
||||
@@ -50,15 +79,28 @@ func Load(configPath string) (*Config, error) {
|
||||
return nil, fmt.Errorf("error parsing config file: %v", err)
|
||||
}
|
||||
|
||||
if err := config.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("invalid configuration: %v", err)
|
||||
}
|
||||
|
||||
return &config, nil
|
||||
}
|
||||
|
||||
// DefaultConfigPath returns the default configuration file name, "rollup.yml".
// The returned value is a bare file name with no directory component.
func DefaultConfigPath() string {
|
||||
return "rollup.yml"
|
||||
}
|
||||
// Validate checks the configuration for any invalid values
|
||||
func (c *Config) Validate() error {
|
||||
if c.RequestsPerSecond != nil && *c.RequestsPerSecond <= 0 {
|
||||
return fmt.Errorf("requests_per_second must be positive")
|
||||
}
|
||||
|
||||
// FileExists reports whether os.Stat succeeds for filename.
// Every Stat error is treated as "does not exist", so a false result does
// not by itself prove the path is absent.
func FileExists(filename string) bool {
|
||||
_, err := os.Stat(filename)
|
||||
return err == nil
|
||||
}
|
||||
if c.BurstLimit != nil && *c.BurstLimit <= 0 {
|
||||
return fmt.Errorf("burst_limit must be positive")
|
||||
}
|
||||
|
||||
for _, site := range c.Sites {
|
||||
if site.BaseURL == "" {
|
||||
return fmt.Errorf("base_url must be specified for each site")
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user