diff --git a/cmd/files.go b/cmd/files.go index 568b2fb..ba1d63f 100644 --- a/cmd/files.go +++ b/cmd/files.go @@ -33,7 +33,7 @@ whose name is -rollup-.md.`, func init() { filesCmd.Flags().StringVarP(&path, "path", "p", ".", "Path to the project directory") - filesCmd.Flags().StringVarP(&fileTypes, "types", "t", ".go,.md,.txt", "Comma-separated list of file extensions to include") + filesCmd.Flags().StringVarP(&fileTypes, "types", "t", "go,md,txt", "Comma-separated list of file extensions to include (without leading dot)") filesCmd.Flags().StringVarP(&codeGenPatterns, "codegen", "g", "", "Comma-separated list of glob patterns for code-generated files") filesCmd.Flags().StringVarP(&ignorePatterns, "ignore", "i", "", "Comma-separated list of glob patterns for files to ignore") } diff --git a/internal/config/config.go b/internal/config/config.go index e6a3ba0..f3e67a6 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -88,6 +88,10 @@ func Load(configPath string) (*Config, error) { // Validate checks the configuration for any invalid values func (c *Config) Validate() error { + if len(c.FileExtensions) == 0 && len(c.Sites) == 0 { + return fmt.Errorf("file_extensions or sites must be specified") + } + if c.RequestsPerSecond != nil && *c.RequestsPerSecond <= 0 { return fmt.Errorf("requests_per_second must be positive") } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 400fcc3..77a36fd 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -10,8 +10,8 @@ func TestLoad(t *testing.T) { // Create a temporary config file content := []byte(` file_extensions: - - .go - - .md + - go + - md ignore_paths: - "*.tmp" - "**/*.log" @@ -27,7 +27,7 @@ sites: - "/blog" exclude_paths: - "/admin" - file_name_prefix: "example" + file_name_prefix: "example" path_overrides: - path: "/special" css_locator: ".special-content" @@ -61,7 +61,7 @@ burst_limit: 5 rps := 1.0 bl := 5 expectedConfig := &Config{ - FileExtensions: []string{".go", ".md"}, + FileExtensions: []string{"go", "md"}, IgnorePaths: []string{"*.tmp", "**/*.log"}, CodeGeneratedPaths: []string{"generated_*.go"}, Sites: []SiteConfig{ @@ -100,7 +100,7 @@ func TestValidate(t *testing.T) { { name: "Valid config", config: Config{ - FileExtensions: []string{".go"}, + FileExtensions: []string{"go"}, Sites: []SiteConfig{ {BaseURL: "https://example.com"}, }, @@ -115,7 +115,7 @@ func TestValidate(t *testing.T) { { name: "Invalid requests per second", config: Config{ - FileExtensions: []string{".go"}, + FileExtensions: []string{"go"}, RequestsPerSecond: func() *float64 { f := -1.0; return &f }(), }, wantErr: true, @@ -123,7 +123,7 @@ func TestValidate(t *testing.T) { { name: "Invalid burst limit", config: Config{ - FileExtensions: []string{".go"}, + FileExtensions: []string{"go"}, BurstLimit: func() *int { i := -1; return &i }(), }, wantErr: true, @@ -131,19 +131,11 @@ func TestValidate(t *testing.T) { { name: "Site without base URL", config: Config{ - FileExtensions: []string{".go"}, + FileExtensions: []string{"go"}, Sites: []SiteConfig{{}}, }, wantErr: true, }, - { - name: "Negative max depth", - config: Config{ - FileExtensions: []string{".go"}, - Sites: []SiteConfig{{BaseURL: "https://example.com"}}, - }, - wantErr: true, - }, } for _, tt := range tests { diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go index ec02650..2776d56 100644 --- a/internal/scraper/scraper.go +++ b/internal/scraper/scraper.go @@ -74,6 +74,9 @@ func ScrapeSites(config Config) error { var wg sync.WaitGroup totalURLs := 0 + for _, site := range config.Sites { + totalURLs += len(site.AllowedPaths) + } for _, site := range config.Sites { logger.Printf("Processing site: %s\n", site.BaseURL) wg.Add(1) @@ -81,7 +84,6 @@ func ScrapeSites(config Config) error { defer wg.Done() for _, path := range site.AllowedPaths { fullURL := site.BaseURL + path - totalURLs++ logger.Printf("Queueing URL for scraping: %s\n", fullURL) scrapeSingleURL(fullURL, site, results, limiter) } @@ -532,8 +534,6 @@ func scrollPage(page playwright.Page) error { () => { window.scrollTo(0, document.body.scrollHeight); return document.body.scrollHeight; - // wait for 500 ms - new Promise(resolve => setTimeout(resolve, 500)); } ` @@ -565,8 +565,8 @@ func scrollPage(page playwright.Page) error { previousHeight = currentHeight - // Wait for a while before scrolling again - + // Wait for content to load before scrolling again + time.Sleep(100 * time.Millisecond) } logger.Println("Scrolling back to top") diff --git a/main.go b/main.go index e8ad8a0..4127f4b 100644 --- a/main.go +++ b/main.go @@ -10,8 +10,6 @@ import ( "github.com/tnypxl/rollup/internal/scraper" ) -var cfg *config.Config - func main() { // Check if the command is "help" isHelpCommand := len(os.Args) > 1 && (os.Args[1] == "help" || os.Args[1] == "--help" || os.Args[1] == "-h")