diff --git a/README.md b/README.md index dc9f8e4..eb095ce 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Rollup aggregates the contents of text-based files and webpages into a markdown - Flexible configuration file support (YAML) - Automatic generation of default configuration file - Custom output file naming -- Concurrent processing for improved performance +- Rate limiting for web scraping to respect server resources ## Installation @@ -66,37 +66,36 @@ Rollup can be configured using a YAML file. By default, it looks for `rollup.yml Example `rollup.yml`: ```yaml -file_types: +file_extensions: - go - md -ignore: +ignore_paths: - node_modules/** - vendor/** - .git/** -code_generated: +code_generated_paths: - **/generated/** -scrape: - sites: - - base_url: https://example.com - css_locator: .content - exclude_selectors: - - .ads - - .navigation - max_depth: 2 - allowed_paths: - - /blog - - /docs - exclude_paths: - - /admin - output_alias: example - path_overrides: - - path: /special-page - css_locator: .special-content - exclude_selectors: - - .special-ads - output_type: single - requests_per_second: 1.0 - burst_limit: 3 +sites: + - base_url: https://example.com + css_locator: .content + exclude_selectors: + - .ads + - .navigation + max_depth: 2 + allowed_paths: + - /blog + - /docs + exclude_paths: + - /admin + output_alias: example + path_overrides: + - path: /special-page + css_locator: .special-content + exclude_selectors: + - .special-ads +output_type: single +requests_per_second: 1.0 +burst_limit: 3 ``` ## Examples @@ -107,10 +106,10 @@ scrape: rollup files ``` -2. Web scraping with multiple URLs and increased concurrency: +2. Web scraping with multiple URLs: ```bash - rollup web --urls=https://example.com,https://another-example.com --concurrent=8 + rollup web --urls=https://example.com,https://another-example.com ``` 3. Generate a default configuration file: @@ -119,20 +118,22 @@ scrape: rollup generate ``` -4. Use a custom configuration file and specify output: +4. Use a custom configuration file: ```bash - rollup files --config=my-config.yml --output=project_summary.md + rollup files --config=my-config.yml ``` -5. Web scraping with separate output files and custom timeout: +5. Web scraping with separate output files: + ```bash - rollup web --urls=https://example.com,https://another-example.com --output=separate --timeout=60 + rollup web --urls=https://example.com,https://another-example.com --output=separate ``` 6. Rollup files with specific types and ignore patterns: + ```bash - rollup files --types=.go,.md --ignore=vendor/**,*_test.go + rollup files --types=go,md --ignore=vendor/**,*_test.go ``` 7. Web scraping with depth and CSS selector: diff --git a/cmd/files.go b/cmd/files.go index 70ae3b6..26666e7 100644 --- a/cmd/files.go +++ b/cmd/files.go @@ -108,19 +108,20 @@ func isIgnored(filePath string, patterns []string) bool { func runRollup(cfg *config.Config) error { // Use config if available, otherwise use command-line flags - var types, codeGenList, ignoreList []string - if cfg != nil && len(cfg.FileTypes) > 0 { - types = cfg.FileTypes + var types []string + var codeGenList, ignoreList []string + if cfg != nil && len(cfg.FileExtensions) > 0 { + types = cfg.FileExtensions } else { types = strings.Split(fileTypes, ",") } - if cfg != nil && len(cfg.CodeGenerated) > 0 { - codeGenList = cfg.CodeGenerated + if cfg != nil && len(cfg.CodeGeneratedPaths) > 0 { + codeGenList = cfg.CodeGeneratedPaths } else { codeGenList = strings.Split(codeGenPatterns, ",") } - if cfg != nil && cfg.Ignore != nil && len(cfg.Ignore) > 0 { - ignoreList = cfg.Ignore + if cfg != nil && len(cfg.IgnorePaths) > 0 { + ignoreList = cfg.IgnorePaths } else { ignoreList = strings.Split(ignorePatterns, ",") } diff --git a/cmd/files_test.go b/cmd/files_test.go index 13ef72f..3a1a210 100644 --- a/cmd/files_test.go +++ b/cmd/files_test.go @@ -112,9 +112,9 @@ func TestRunRollup(t *testing.T) { // Set up test configuration cfg = &config.Config{ - FileTypes: []string{"go", "txt", "md"}, - Ignore: []string{"*.json", ".git/**", "vendor/**"}, - CodeGenerated: []string{"generated_*.go"}, + FileExtensions: []string{"go", "txt", "md"}, + IgnorePaths: []string{"*.json", ".git/**", "vendor/**"}, + CodeGeneratedPaths: []string{"generated_*.go"}, } // Change working directory to the temp directory diff --git a/cmd/generate.go b/cmd/generate.go index a872753..e8805b8 100644 --- a/cmd/generate.go +++ b/cmd/generate.go @@ -38,23 +38,23 @@ func runGenerate(cmd *cobra.Command, args []string) error { } cfg := config.Config{ - FileTypes: make([]string, 0, len(fileTypes)), - Ignore: []string{"node_modules/**", "vendor/**", ".git/**"}, + FileExtensions: make([]string, 0, len(fileTypes)), + IgnorePaths: []string{"node_modules/**", "vendor/**", ".git/**"}, } for ext := range fileTypes { - cfg.FileTypes = append(cfg.FileTypes, ext) + cfg.FileExtensions = append(cfg.FileExtensions, ext) } // Sort file types for consistency - sort.Strings(cfg.FileTypes) + sort.Strings(cfg.FileExtensions) yamlData, err := yaml.Marshal(&cfg) if err != nil { return fmt.Errorf("error marshaling config: %v", err) } - outputPath := config.DefaultConfigPath() + outputPath := "rollup.yml" err = os.WriteFile(outputPath, yamlData, 0644) if err != nil { return fmt.Errorf("error writing config file: %v", err) diff --git a/cmd/web.go b/cmd/web.go index ecc24d4..2218593 100644 --- a/cmd/web.go +++ b/cmd/web.go @@ -50,10 +50,10 @@ func runWeb(cmd *cobra.Command, args []string) error { scraperConfig.Verbose = verbose var siteConfigs []scraper.SiteConfig - if len(cfg.Scrape.Sites) > 0 { - logger.Printf("Using configuration from rollup.yml for %d sites", len(cfg.Scrape.Sites)) - siteConfigs = make([]scraper.SiteConfig, len(cfg.Scrape.Sites)) - for i, site := range cfg.Scrape.Sites { + if len(cfg.Sites) > 0 { + logger.Printf("Using configuration from rollup.yml for %d sites", len(cfg.Sites)) + siteConfigs = make([]scraper.SiteConfig, len(cfg.Sites)) + for i, site := range cfg.Sites { siteConfigs[i] = scraper.SiteConfig{ BaseURL: site.BaseURL, CSSLocator: site.CSSLocator, @@ -92,13 +92,13 @@ func runWeb(cmd *cobra.Command, args []string) error { defaultBurstLimit := 3 // Use default values if not set in the configuration - requestsPerSecond := cfg.Scrape.RequestsPerSecond - if requestsPerSecond == 0 { - requestsPerSecond = defaultRequestsPerSecond + requestsPerSecond := defaultRequestsPerSecond + if cfg.RequestsPerSecond != nil { + requestsPerSecond = *cfg.RequestsPerSecond } - burstLimit := cfg.Scrape.BurstLimit - if burstLimit == 0 { - burstLimit = defaultBurstLimit + burstLimit := defaultBurstLimit + if cfg.BurstLimit != nil { + burstLimit = *cfg.BurstLimit } scraperConfig := scraper.Config{ diff --git a/internal/config/config.go b/internal/config/config.go index 0042396..d4e9198 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -8,34 +8,63 @@ import ( ) type Config struct { - FileTypes []string `yaml:"file_types"` - Ignore []string `yaml:"ignore"` - CodeGenerated []string `yaml:"code_generated"` - Scrape ScrapeConfig `yaml:"scrape"` -} + // FileExtensions is a list of file extensions to include in the rollup + FileExtensions []string `yaml:"file_extensions"` -type ScrapeConfig struct { - Sites []SiteConfig `yaml:"sites"` - OutputType string `yaml:"output_type"` - RequestsPerSecond float64 `yaml:"requests_per_second"` - BurstLimit int `yaml:"burst_limit"` + // IgnorePaths is a list of glob patterns for paths to ignore + IgnorePaths []string `yaml:"ignore_paths"` + + // CodeGeneratedPaths is a list of glob patterns for code-generated files + CodeGeneratedPaths []string `yaml:"code_generated_paths"` + + // Sites is a list of site configurations for web scraping + Sites []SiteConfig `yaml:"sites"` + + // OutputType specifies how the output should be generated + OutputType string `yaml:"output_type"` + + // RequestsPerSecond limits the rate of web requests + RequestsPerSecond *float64 `yaml:"requests_per_second,omitempty"` + + // BurstLimit sets the maximum burst size for rate limiting + BurstLimit *int `yaml:"burst_limit,omitempty"` } type SiteConfig struct { - BaseURL string `yaml:"base_url"` - CSSLocator string `yaml:"css_locator"` - ExcludeSelectors []string `yaml:"exclude_selectors"` - MaxDepth int `yaml:"max_depth"` - AllowedPaths []string `yaml:"allowed_paths"` - ExcludePaths []string `yaml:"exclude_paths"` - OutputAlias string `yaml:"output_alias"` - PathOverrides []PathOverride `yaml:"path_overrides"` + // BaseURL is the starting point for scraping this site + BaseURL string `yaml:"base_url"` + + // CSSLocator is used to extract specific content + CSSLocator string `yaml:"css_locator"` + + // ExcludeSelectors lists CSS selectors for content to exclude + ExcludeSelectors []string `yaml:"exclude_selectors"` + + // MaxDepth sets the maximum depth for link traversal + MaxDepth int `yaml:"max_depth"` + + // AllowedPaths lists paths that are allowed to be scraped + AllowedPaths []string `yaml:"allowed_paths"` + + // ExcludePaths lists paths that should not be scraped + ExcludePaths []string `yaml:"exclude_paths"` + + // OutputAlias provides an alternative name for output files + OutputAlias string `yaml:"output_alias"` + + // PathOverrides allows for path-specific configurations + PathOverrides []PathOverride `yaml:"path_overrides"` } type PathOverride struct { - Path string `yaml:"path"` - CSSLocator string `yaml:"css_locator"` - ExcludeSelectors []string `yaml:"exclude_selectors"` + // Path is the URL path this override applies to + Path string `yaml:"path"` + + // CSSLocator overrides the site-wide CSS locator for this path + CSSLocator string `yaml:"css_locator"` + + // ExcludeSelectors overrides the site-wide exclude selectors for this path + ExcludeSelectors []string `yaml:"exclude_selectors"` } func Load(configPath string) (*Config, error) { @@ -50,15 +79,34 @@ func Load(configPath string) (*Config, error) { return nil, fmt.Errorf("error parsing config file: %v", err) } + if err := config.Validate(); err != nil { + return nil, fmt.Errorf("invalid configuration: %v", err) + } + return &config, nil } -func DefaultConfigPath() string { - return "rollup.yml" -} +func (c *Config) Validate() error { + if len(c.FileExtensions) == 0 { + return fmt.Errorf("at least one file extension must be specified") + } -func FileExists(filename string) bool { - _, err := os.Stat(filename) - return err == nil -} + if c.RequestsPerSecond != nil && *c.RequestsPerSecond <= 0 { + return fmt.Errorf("requests_per_second must be positive") + } + if c.BurstLimit != nil && *c.BurstLimit <= 0 { + return fmt.Errorf("burst_limit must be positive") + } + + for _, site := range c.Sites { + if site.BaseURL == "" { + return fmt.Errorf("base_url must be specified for each site") + } + if site.MaxDepth < 0 { + return fmt.Errorf("max_depth must be non-negative") + } + } + + return nil +} diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 483c333..8c1c0b8 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -9,34 +9,33 @@ import ( func TestLoad(t *testing.T) { // Create a temporary config file content := []byte(` -file_types: - - go - - md -ignore: +file_extensions: + - .go + - .md +ignore_paths: - "*.tmp" - "**/*.log" -code_generated: +code_generated_paths: - "generated_*.go" -scrape: - sites: - - base_url: "https://example.com" - css_locator: "main" - exclude_selectors: - - ".ads" - max_depth: 2 - allowed_paths: - - "/blog" - exclude_paths: - - "/admin" - output_alias: "example" - path_overrides: - - path: "/special" - css_locator: ".special-content" - exclude_selectors: - - ".sidebar" - output_type: "single" - requests_per_second: 1.0 - burst_limit: 5 +sites: + - base_url: "https://example.com" + css_locator: "main" + exclude_selectors: + - ".ads" + max_depth: 2 + allowed_paths: + - "/blog" + exclude_paths: + - "/admin" + output_alias: "example" + path_overrides: + - path: "/special" + css_locator: ".special-content" + exclude_selectors: + - ".sidebar" +output_type: "single" +requests_per_second: 1.0 +burst_limit: 5 `) tmpfile, err := os.CreateTemp("", "config*.yml") @@ -59,33 +58,33 @@ scrape: } // Check if the loaded config matches the expected values + rps := 1.0 + bl := 5 expectedConfig := &Config{ - FileTypes: []string{"go", "md"}, - Ignore: []string{"*.tmp", "**/*.log"}, - CodeGenerated: []string{"generated_*.go"}, - Scrape: ScrapeConfig{ - Sites: []SiteConfig{ - { - BaseURL: "https://example.com", - CSSLocator: "main", - ExcludeSelectors: []string{".ads"}, - MaxDepth: 2, - AllowedPaths: []string{"/blog"}, - ExcludePaths: []string{"/admin"}, - OutputAlias: "example", - PathOverrides: []PathOverride{ - { - Path: "/special", - CSSLocator: ".special-content", - ExcludeSelectors: []string{".sidebar"}, - }, + FileExtensions: []string{".go", ".md"}, + IgnorePaths: []string{"*.tmp", "**/*.log"}, + CodeGeneratedPaths: []string{"generated_*.go"}, + Sites: []SiteConfig{ + { + BaseURL: "https://example.com", + CSSLocator: "main", + ExcludeSelectors: []string{".ads"}, + MaxDepth: 2, + AllowedPaths: []string{"/blog"}, + ExcludePaths: []string{"/admin"}, + OutputAlias: "example", + PathOverrides: []PathOverride{ + { + Path: "/special", + CSSLocator: ".special-content", + ExcludeSelectors: []string{".sidebar"}, }, }, }, - OutputType: "single", - RequestsPerSecond: 1.0, - BurstLimit: 5, }, + OutputType: "single", + RequestsPerSecond: &rps, + BurstLimit: &bl, } if !reflect.DeepEqual(config, expectedConfig) { @@ -93,28 +92,67 @@ scrape: } } -func TestDefaultConfigPath(t *testing.T) { - expected := "rollup.yml" - result := DefaultConfigPath() - if result != expected { - t.Errorf("DefaultConfigPath() = %q, want %q", result, expected) - } -} - -func TestFileExists(t *testing.T) { - // Test with an existing file - tmpfile, err := os.CreateTemp("", "testfile") - if err != nil { - t.Fatalf("Failed to create temp file: %v", err) - } - defer os.Remove(tmpfile.Name()) - - if !FileExists(tmpfile.Name()) { - t.Errorf("FileExists(%q) = false, want true", tmpfile.Name()) - } - - // Test with a non-existing file - if FileExists("non_existing_file.txt") { - t.Errorf("FileExists(\"non_existing_file.txt\") = true, want false") +func TestValidate(t *testing.T) { + tests := []struct { + name string + config Config + wantErr bool + }{ + { + name: "Valid config", + config: Config{ + FileExtensions: []string{".go"}, + Sites: []SiteConfig{ + {BaseURL: "https://example.com", MaxDepth: 2}, + }, + }, + wantErr: false, + }, + { + name: "No file extensions", + config: Config{}, + wantErr: true, + }, + { + name: "Invalid requests per second", + config: Config{ + FileExtensions: []string{".go"}, + RequestsPerSecond: func() *float64 { f := -1.0; return &f }(), + }, + wantErr: true, + }, + { + name: "Invalid burst limit", + config: Config{ + FileExtensions: []string{".go"}, + BurstLimit: func() *int { i := -1; return &i }(), + }, + wantErr: true, + }, + { + name: "Site without base URL", + config: Config{ + FileExtensions: []string{".go"}, + Sites: []SiteConfig{{}}, + }, + wantErr: true, + }, + { + name: "Negative max depth", + config: Config{ + FileExtensions: []string{".go"}, + Sites: []SiteConfig{{BaseURL: "https://example.com", MaxDepth: -1}}, + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.config.Validate() + if (err != nil) != tt.wantErr { + t.Errorf("Validate() error = %v, wantErr %v", err, tt.wantErr) + } + }) } } diff --git a/main.go b/main.go index 0f7d9c7..e8ad8a0 100644 --- a/main.go +++ b/main.go @@ -20,7 +20,7 @@ func main() { var err error if !isHelpCommand { - configPath := config.DefaultConfigPath() + configPath := "rollup.yml" cfg, err = config.Load(configPath) if err != nil { log.Printf("Warning: Failed to load configuration: %v", err)