diff --git a/README.md b/README.md
index dc9f8e4..eb095ce 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Rollup aggregates the contents of text-based files and webpages into a markdown
 - Flexible configuration file support (YAML)
 - Automatic generation of default configuration file
 - Custom output file naming
-- Concurrent processing for improved performance
+- Rate limiting for web scraping to respect server resources
 
 ## Installation
 
@@ -66,37 +66,35 @@ Rollup can be configured using a YAML file. By default, it looks for `rollup.yml
 Example `rollup.yml`:
 
 ```yaml
-file_types:
+file_extensions:
   - go
   - md
-ignore:
+ignore_paths:
   - node_modules/**
   - vendor/**
   - .git/**
-code_generated:
+code_generated_paths:
   - **/generated/**
-scrape:
-  sites:
-    - base_url: https://example.com
-      css_locator: .content
-      exclude_selectors:
-        - .ads
-        - .navigation
-      max_depth: 2
-      allowed_paths:
-        - /blog
-        - /docs
-      exclude_paths:
-        - /admin
-      output_alias: example
-      path_overrides:
-        - path: /special-page
-          css_locator: .special-content
-          exclude_selectors:
-            - .special-ads
-  output_type: single
-  requests_per_second: 1.0
-  burst_limit: 3
+sites:
+  - base_url: https://example.com
+    css_locator: .content
+    exclude_selectors:
+      - .ads
+      - .navigation
+    allowed_paths:
+      - /blog
+      - /docs
+    exclude_paths:
+      - /admin
+    output_alias: example
+    path_overrides:
+      - path: /special-page
+        css_locator: .special-content
+        exclude_selectors:
+          - .special-ads
+output_type: single
+requests_per_second: 1.0
+burst_limit: 3
 ```
 
 ## Examples
@@ -107,10 +106,10 @@ scrape:
    rollup files
    ```
 
-2. Web scraping with multiple URLs and increased concurrency:
+2. Web scraping with multiple URLs:
 
    ```bash
-   rollup web --urls=https://example.com,https://another-example.com --concurrent=8
+   rollup web --urls=https://example.com,https://another-example.com
    ```
 
 3. Generate a default configuration file:
@@ -119,20 +118,22 @@ scrape:
    rollup generate
    ```
 
-4. Use a custom configuration file and specify output:
+4. Use a custom configuration file:
 
   ```bash
-   rollup files --config=my-config.yml --output=project_summary.md
+   rollup files --config=my-config.yml
   ```
 
-5. Web scraping with separate output files and custom timeout:
+5. Web scraping with separate output files:
+
   ```bash
-   rollup web --urls=https://example.com,https://another-example.com --output=separate --timeout=60
+   rollup web --urls=https://example.com,https://another-example.com --output=separate
   ```
 
 6. Rollup files with specific types and ignore patterns:
+
   ```bash
-   rollup files --types=.go,.md --ignore=vendor/**,*_test.go
+   rollup files --types=go,md --ignore=vendor/**,*_test.go
   ```
 
-7. Web scraping with depth and CSS selector:
+7. Web scraping with a CSS selector:
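With the `scrape:` wrapper gone, a files-only setup needs no nesting at all. A minimal `rollup.yml` sketch using only the keys introduced above (the values here are illustrative, not defaults):

```yaml
file_extensions:
  - go
ignore_paths:
  - vendor/**
```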
diff --git a/cmd/files.go b/cmd/files.go
index 70ae3b6..568b2fb 100644
--- a/cmd/files.go
+++ b/cmd/files.go
@@ -108,19 +108,20 @@ func isIgnored(filePath string, patterns []string) bool {
 
 func runRollup(cfg *config.Config) error {
 	// Use config if available, otherwise use command-line flags
-	var types, codeGenList, ignoreList []string
-	if cfg != nil && len(cfg.FileTypes) > 0 {
-		types = cfg.FileTypes
+	var types []string
+	var codeGenList, ignoreList []string
+	if cfg != nil && len(cfg.FileExtensions) > 0 {
+		types = cfg.FileExtensions
 	} else {
 		types = strings.Split(fileTypes, ",")
 	}
-	if cfg != nil && len(cfg.CodeGenerated) > 0 {
-		codeGenList = cfg.CodeGenerated
+	if cfg != nil && len(cfg.CodeGeneratedPaths) > 0 {
+		codeGenList = cfg.CodeGeneratedPaths
 	} else {
 		codeGenList = strings.Split(codeGenPatterns, ",")
 	}
-	if cfg != nil && cfg.Ignore != nil && len(cfg.Ignore) > 0 {
-		ignoreList = cfg.Ignore
+	if cfg != nil && len(cfg.IgnorePaths) > 0 {
+		ignoreList = cfg.IgnorePaths
 	} else {
 		ignoreList = strings.Split(ignorePatterns, ",")
 	}
@@ -145,6 +146,11 @@ func runRollup(cfg *config.Config) error {
 	}
 	defer outputFile.Close()
 
+	startTime := time.Now()
+	showProgress := false
+	progressTicker := time.NewTicker(500 * time.Millisecond)
+	defer progressTicker.Stop()
+
 	// Walk through the directory
 	err = filepath.Walk(absPath, func(path string, info os.FileInfo, err error) error {
 		if err != nil {
@@ -160,16 +166,25 @@ func runRollup(cfg *config.Config) error {
 
 		// Check if the file should be ignored
 		if isIgnored(relPath, ignoreList) {
+			if verbose {
+				fmt.Printf("Ignoring file: %s\n", relPath)
+			}
 			return nil
 		}
 
 		ext := filepath.Ext(path)
 		for _, t := range types {
 			if ext == "."+t {
+				// Verbose logging for processed file
+				if verbose {
+					size := humanReadableSize(info.Size())
+					fmt.Printf("Processing file: %s (%s)\n", relPath, size)
+				}
+
 				// Read file contents
 				content, err := os.ReadFile(path)
 				if err != nil {
-					fmt.Printf("Error reading file %s: %v", path, err)
+					fmt.Printf("Error reading file %s: %v\n", path, err)
 					return nil
 				}
 
@@ -185,12 +200,43 @@ func runRollup(cfg *config.Config) error {
 				break
 			}
 		}
+
+		if !showProgress && time.Since(startTime) > 5*time.Second {
+			showProgress = true
+			fmt.Print("This is taking a while (hold tight) ")
+		}
+
+		select {
+		case <-progressTicker.C:
+			if showProgress {
+				fmt.Print(".")
+			}
+		default:
+		}
+
 		return nil
 	})
 	if err != nil {
 		return fmt.Errorf("error walking through directory: %v", err)
 	}
 
-	fmt.Printf("Rollup complete. Output file: %s", outputFileName)
+	if showProgress {
+		fmt.Println() // Print a newline after the progress dots
+	}
+
+	fmt.Printf("Rollup complete. Output file: %s\n", outputFileName)
 	return nil
 }
+
+func humanReadableSize(size int64) string {
+	const unit = 1024
+	if size < unit {
+		return fmt.Sprintf("%d B", size)
+	}
+	div, exp := int64(unit), 0
+	for n := size / unit; n >= unit; n /= unit {
+		div *= unit
+		exp++
+	}
+	return fmt.Sprintf("%.1f %cB", float64(size)/float64(div), "KMGTPE"[exp])
+}
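The size formatter added above repeatedly divides by 1024 and picks the matching unit suffix. A standalone check of its output; the function body is copied verbatim from the diff, the sample values are mine:

```go
package main

import "fmt"

// humanReadableSize is copied from the version added to cmd/files.go above.
func humanReadableSize(size int64) string {
	const unit = 1024
	if size < unit {
		return fmt.Sprintf("%d B", size)
	}
	div, exp := int64(unit), 0
	for n := size / unit; n >= unit; n /= unit {
		div *= unit
		exp++
	}
	return fmt.Sprintf("%.1f %cB", float64(size)/float64(div), "KMGTPE"[exp])
}

func main() {
	fmt.Println(humanReadableSize(512))      // 512 B
	fmt.Println(humanReadableSize(1536))     // 1.5 KB
	fmt.Println(humanReadableSize(10 << 20)) // 10.0 MB
}
```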
diff --git a/cmd/files_test.go b/cmd/files_test.go
index 13ef72f..3a1a210 100644
--- a/cmd/files_test.go
+++ b/cmd/files_test.go
@@ -112,9 +112,9 @@ func TestRunRollup(t *testing.T) {
 
 	// Set up test configuration
 	cfg = &config.Config{
-		FileTypes:     []string{"go", "txt", "md"},
-		Ignore:        []string{"*.json", ".git/**", "vendor/**"},
-		CodeGenerated: []string{"generated_*.go"},
+		FileExtensions:     []string{"go", "txt", "md"},
+		IgnorePaths:        []string{"*.json", ".git/**", "vendor/**"},
+		CodeGeneratedPaths: []string{"generated_*.go"},
 	}
 
 	// Change working directory to the temp directory
diff --git a/cmd/generate.go b/cmd/generate.go
index a872753..e8805b8 100644
--- a/cmd/generate.go
+++ b/cmd/generate.go
@@ -38,23 +38,23 @@ func runGenerate(cmd *cobra.Command, args []string) error {
 	}
 
 	cfg := config.Config{
-		FileTypes: make([]string, 0, len(fileTypes)),
-		Ignore:    []string{"node_modules/**", "vendor/**", ".git/**"},
+		FileExtensions: make([]string, 0, len(fileTypes)),
+		IgnorePaths:    []string{"node_modules/**", "vendor/**", ".git/**"},
 	}
 
 	for ext := range fileTypes {
-		cfg.FileTypes = append(cfg.FileTypes, ext)
+		cfg.FileExtensions = append(cfg.FileExtensions, ext)
 	}
 
 	// Sort file types for consistency
-	sort.Strings(cfg.FileTypes)
+	sort.Strings(cfg.FileExtensions)
 
 	yamlData, err := yaml.Marshal(&cfg)
 	if err != nil {
 		return fmt.Errorf("error marshaling config: %v", err)
 	}
 
-	outputPath := config.DefaultConfigPath()
+	outputPath := "rollup.yml"
 	err = os.WriteFile(outputPath, yamlData, 0644)
 	if err != nil {
 		return fmt.Errorf("error writing config file: %v", err)
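With the renamed fields, the file `rollup generate` writes should come out roughly like this for a project containing Go and Markdown files. The rendering is reconstructed from the struct tags above and yaml.v2's usual output (nil slices marshal as `[]`, the empty string as `""`, and `omitempty` on the two pointer fields keeps the rate-limit keys out of the default file); treat it as a sketch, not captured output:

```yaml
file_extensions:
- go
- md
ignore_paths:
- node_modules/**
- vendor/**
- .git/**
code_generated_paths: []
sites: []
output_type: ""
```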
diff --git a/cmd/web.go b/cmd/web.go
index ecc24d4..c73e32c 100644
--- a/cmd/web.go
+++ b/cmd/web.go
@@ -18,7 +18,6 @@ import (
 var (
 	urls             []string
 	outputType       string
-	depth            int
 	includeSelector  string
 	excludeSelectors []string
 )
@@ -35,7 +34,6 @@ var webCmd = &cobra.Command{
 func init() {
 	webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
 	webCmd.Flags().StringVarP(&outputType, "output", "o", "single", "Output type: 'single' for one file, 'separate' for multiple files")
-	webCmd.Flags().IntVarP(&depth, "depth", "d", 0, "Depth of link traversal (default: 0, only scrape the given URLs)")
 	webCmd.Flags().StringVar(&includeSelector, "css", "", "CSS selector to extract specific content")
 	webCmd.Flags().StringSliceVar(&excludeSelectors, "exclude", []string{}, "CSS selectors to exclude from the extracted content (comma-separated)")
 }
@@ -50,22 +48,21 @@ func runWeb(cmd *cobra.Command, args []string) error {
 	scraperConfig.Verbose = verbose
 
 	var siteConfigs []scraper.SiteConfig
-	if len(cfg.Scrape.Sites) > 0 {
-		logger.Printf("Using configuration from rollup.yml for %d sites", len(cfg.Scrape.Sites))
-		siteConfigs = make([]scraper.SiteConfig, len(cfg.Scrape.Sites))
-		for i, site := range cfg.Scrape.Sites {
+	if len(cfg.Sites) > 0 {
+		logger.Printf("Using configuration from rollup.yml for %d sites", len(cfg.Sites))
+		siteConfigs = make([]scraper.SiteConfig, len(cfg.Sites))
+		for i, site := range cfg.Sites {
 			siteConfigs[i] = scraper.SiteConfig{
 				BaseURL:          site.BaseURL,
 				CSSLocator:       site.CSSLocator,
 				ExcludeSelectors: site.ExcludeSelectors,
-				MaxDepth:         site.MaxDepth,
 				AllowedPaths:     site.AllowedPaths,
 				ExcludePaths:     site.ExcludePaths,
 				OutputAlias:      site.OutputAlias,
 				PathOverrides:    convertPathOverrides(site.PathOverrides),
 			}
-			logger.Printf("Site %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d, AllowedPaths=%v",
-				i+1, site.BaseURL, site.CSSLocator, site.MaxDepth, site.AllowedPaths)
+			logger.Printf("Site %d configuration: BaseURL=%s, CSSLocator=%s, AllowedPaths=%v",
+				i+1, site.BaseURL, site.CSSLocator, site.AllowedPaths)
 		}
 	} else {
 		logger.Printf("No sites defined in rollup.yml, falling back to URL-based configuration")
@@ -75,10 +72,9 @@ func runWeb(cmd *cobra.Command, args []string) error {
 				BaseURL:          u,
 				CSSLocator:       includeSelector,
 				ExcludeSelectors: excludeSelectors,
-				MaxDepth:         depth,
 			}
-			logger.Printf("URL %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d",
-				i+1, u, includeSelector, depth)
+			logger.Printf("URL %d configuration: BaseURL=%s, CSSLocator=%s",
+				i+1, u, includeSelector)
 		}
 	}
 
@@ -92,13 +88,13 @@ func runWeb(cmd *cobra.Command, args []string) error {
 	defaultBurstLimit := 3
 
 	// Use default values if not set in the configuration
-	requestsPerSecond := cfg.Scrape.RequestsPerSecond
-	if requestsPerSecond == 0 {
-		requestsPerSecond = defaultRequestsPerSecond
+	requestsPerSecond := defaultRequestsPerSecond
+	if cfg.RequestsPerSecond != nil {
+		requestsPerSecond = *cfg.RequestsPerSecond
 	}
-	burstLimit := cfg.Scrape.BurstLimit
-	if burstLimit == 0 {
-		burstLimit = defaultBurstLimit
+	burstLimit := defaultBurstLimit
+	if cfg.BurstLimit != nil {
+		burstLimit = *cfg.BurstLimit
 	}
 
 	scraperConfig := scraper.Config{
@@ -114,7 +110,32 @@ func runWeb(cmd *cobra.Command, args []string) error {
 		outputType, requestsPerSecond, burstLimit)
 
 	logger.Println("Starting scraping process")
+	startTime := time.Now()
+	progressTicker := time.NewTicker(time.Second)
+	defer progressTicker.Stop()
+
+	done := make(chan bool)
+	messagePrinted := false
+	go func() {
+		for {
+			select {
+			case <-progressTicker.C:
+				if time.Since(startTime) > 5*time.Second && !messagePrinted {
+					fmt.Print("This is taking a while (hold tight) ")
+					messagePrinted = true
+				} else if messagePrinted {
+					fmt.Print(".")
+				}
+			case <-done:
+				return
+			}
+		}
+	}()
+
 	scrapedContent, err := scraper.ScrapeSites(scraperConfig)
+	done <- true
+	fmt.Println() // New line after progress indicator
+
 	if err != nil {
 		logger.Printf("Error occurred during scraping: %v", err)
 		return fmt.Errorf("error scraping content: %v", err)
@@ -179,45 +200,17 @@ func generateDefaultFilename() string {
 	return fmt.Sprintf("web-%s.rollup.md", timestamp)
 }
 
-func scrapeRecursively(urlStr string, depth int) (string, error) {
-	visited := make(map[string]bool)
-	return scrapeURL(urlStr, depth, visited)
-}
-
-func scrapeURL(urlStr string, depth int, visited map[string]bool) (string, error) {
-	if depth < 0 || visited[urlStr] {
-		return "", nil
-	}
-
-	visited[urlStr] = true
-
+func scrapeURL(urlStr string) (string, error) {
 	content, err := testExtractAndConvertContent(urlStr)
 	if err != nil {
 		return "", err
 	}
 
-	if depth > 0 {
-		links, err := testExtractLinks(urlStr)
-		if err != nil {
-			return content, fmt.Errorf("error extracting links: %v", err)
-		}
-
-		for _, link := range links {
-			subContent, err := scrapeURL(link, depth-1, visited)
-			if err != nil {
-				fmt.Printf("Warning: Error scraping %s: %v\n", link, err)
-				continue
-			}
-			content += "\n\n---\n\n" + subContent
-		}
-	}
-
 	return content, nil
 }
 
 var (
 	testExtractAndConvertContent = extractAndConvertContent
-	testExtractLinks             = scraper.ExtractLinks
 )
 
 func extractAndConvertContent(urlStr string) (string, error) {
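The pointer-based defaulting above feeds `golang.org/x/time/rate`, which the scraper already used via `limiter.Wait` in the (now removed) `scrapeSite` loop. A minimal sketch of the same pattern; constructing the limiter from these two numbers is an assumption about what `scraper.ScrapeSites` does internally:

```go
package main

import (
	"context"
	"fmt"

	"golang.org/x/time/rate"
)

func main() {
	// nil means "key absent from rollup.yml"; an explicit 0 would instead
	// survive to Validate and be rejected there.
	var fromConfig *float64 // stands in for cfg.RequestsPerSecond

	requestsPerSecond := 1.0 // defaultRequestsPerSecond in runWeb
	if fromConfig != nil {
		requestsPerSecond = *fromConfig
	}

	// One token every 1/requestsPerSecond seconds, bursts of up to 3,
	// matching the defaults in the diff.
	limiter := rate.NewLimiter(rate.Limit(requestsPerSecond), 3)
	for i := 0; i < 3; i++ {
		if err := limiter.Wait(context.Background()); err != nil {
			fmt.Println("rate limiter error:", err)
			return
		}
		fmt.Println("request", i, "allowed")
	}
}
```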
diff --git a/internal/config/config.go b/internal/config/config.go
index 0042396..3b57b5f 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -7,35 +7,64 @@ import (
 	"gopkg.in/yaml.v2"
 )
 
+// Config represents the configuration for the rollup tool
 type Config struct {
-	FileTypes     []string     `yaml:"file_types"`
-	Ignore        []string     `yaml:"ignore"`
-	CodeGenerated []string     `yaml:"code_generated"`
-	Scrape        ScrapeConfig `yaml:"scrape"`
-}
-
-type ScrapeConfig struct {
-	Sites             []SiteConfig `yaml:"sites"`
-	OutputType        string       `yaml:"output_type"`
-	RequestsPerSecond float64      `yaml:"requests_per_second"`
-	BurstLimit        int          `yaml:"burst_limit"`
+	// FileExtensions is a list of file extensions to include in the rollup
+	FileExtensions []string `yaml:"file_extensions"`
+
+	// IgnorePaths is a list of glob patterns for paths to ignore
+	IgnorePaths []string `yaml:"ignore_paths"`
+
+	// CodeGeneratedPaths is a list of glob patterns for code-generated files
+	CodeGeneratedPaths []string `yaml:"code_generated_paths"`
+
+	// Sites is a list of site configurations for web scraping
+	Sites []SiteConfig `yaml:"sites"`
+
+	// OutputType specifies how the output should be generated
+	OutputType string `yaml:"output_type"`
+
+	// RequestsPerSecond limits the rate of web requests
+	RequestsPerSecond *float64 `yaml:"requests_per_second,omitempty"`
+
+	// BurstLimit sets the maximum burst size for rate limiting
+	BurstLimit *int `yaml:"burst_limit,omitempty"`
 }
 
+// SiteConfig contains configuration for scraping a single site
 type SiteConfig struct {
-	BaseURL          string         `yaml:"base_url"`
-	CSSLocator       string         `yaml:"css_locator"`
-	ExcludeSelectors []string       `yaml:"exclude_selectors"`
-	MaxDepth         int            `yaml:"max_depth"`
-	AllowedPaths     []string       `yaml:"allowed_paths"`
-	ExcludePaths     []string       `yaml:"exclude_paths"`
-	OutputAlias      string         `yaml:"output_alias"`
-	PathOverrides    []PathOverride `yaml:"path_overrides"`
+	// BaseURL is the starting point for scraping this site
+	BaseURL string `yaml:"base_url"`
+
+	// CSSLocator is used to extract specific content
+	CSSLocator string `yaml:"css_locator"`
+
+	// ExcludeSelectors lists CSS selectors for content to exclude
+	ExcludeSelectors []string `yaml:"exclude_selectors"`
+
+	// AllowedPaths lists paths that are allowed to be scraped
+	AllowedPaths []string `yaml:"allowed_paths"`
+
+	// ExcludePaths lists paths that should not be scraped
+	ExcludePaths []string `yaml:"exclude_paths"`
+
+	// OutputAlias provides an alternative name for output files
+	OutputAlias string `yaml:"output_alias"`
+
+	// PathOverrides allows for path-specific configurations
+	PathOverrides []PathOverride `yaml:"path_overrides"`
 }
 
+// PathOverride allows for path-specific configurations
 type PathOverride struct {
-	Path             string   `yaml:"path"`
-	CSSLocator       string   `yaml:"css_locator"`
-	ExcludeSelectors []string `yaml:"exclude_selectors"`
+	// Path is the URL path this override applies to
+	Path string `yaml:"path"`
+
+	// CSSLocator overrides the site-wide CSS locator for this path
+	CSSLocator string `yaml:"css_locator"`
+
+	// ExcludeSelectors overrides the site-wide exclude selectors for this path
+	ExcludeSelectors []string `yaml:"exclude_selectors"`
 }
 
 func Load(configPath string) (*Config, error) {
@@ -50,15 +79,28 @@ func Load(configPath string) (*Config, error) {
 		return nil, fmt.Errorf("error parsing config file: %v", err)
 	}
 
+	if err := config.Validate(); err != nil {
+		return nil, fmt.Errorf("invalid configuration: %v", err)
+	}
+
 	return &config, nil
 }
 
-func DefaultConfigPath() string {
-	return "rollup.yml"
-}
+// Validate checks the configuration for any invalid values
+func (c *Config) Validate() error {
+	if c.RequestsPerSecond != nil && *c.RequestsPerSecond <= 0 {
+		return fmt.Errorf("requests_per_second must be positive")
+	}
 
-func FileExists(filename string) bool {
-	_, err := os.Stat(filename)
-	return err == nil
-}
+	if c.BurstLimit != nil && *c.BurstLimit <= 0 {
+		return fmt.Errorf("burst_limit must be positive")
+	}
 
+	for _, site := range c.Sites {
+		if site.BaseURL == "" {
+			return fmt.Errorf("base_url must be specified for each site")
+		}
+	}
+
+	return nil
+}
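The switch from plain `float64`/`int` to pointer fields is what lets `Validate` distinguish "key omitted" from "explicitly set to zero". A small demonstration of the yaml.v2 behavior this relies on; the field and tag mirror the struct above, the rest is a standalone sketch:

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// cfg mirrors the BurstLimit field introduced in config.go above.
type cfg struct {
	BurstLimit *int `yaml:"burst_limit,omitempty"`
}

func main() {
	var absent, zero cfg
	_ = yaml.Unmarshal([]byte(""), &absent)             // key omitted
	_ = yaml.Unmarshal([]byte("burst_limit: 0"), &zero) // explicit zero

	fmt.Println(absent.BurstLimit == nil) // true: caller falls back to the default
	fmt.Println(zero.BurstLimit == nil)   // false: Validate can reject the 0
}
```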
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index 483c333..8c1c0b8 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -9,34 +9,32 @@ import (
 func TestLoad(t *testing.T) {
 	// Create a temporary config file
 	content := []byte(`
-file_types:
+file_extensions:
   - go
   - md
-ignore:
+ignore_paths:
   - "*.tmp"
   - "**/*.log"
-code_generated:
+code_generated_paths:
   - "generated_*.go"
-scrape:
-  sites:
-    - base_url: "https://example.com"
-      css_locator: "main"
-      exclude_selectors:
-        - ".ads"
-      max_depth: 2
-      allowed_paths:
-        - "/blog"
-      exclude_paths:
-        - "/admin"
-      output_alias: "example"
-      path_overrides:
-        - path: "/special"
-          css_locator: ".special-content"
-          exclude_selectors:
-            - ".sidebar"
-  output_type: "single"
-  requests_per_second: 1.0
-  burst_limit: 5
+sites:
+  - base_url: "https://example.com"
+    css_locator: "main"
+    exclude_selectors:
+      - ".ads"
+    allowed_paths:
+      - "/blog"
+    exclude_paths:
+      - "/admin"
+    output_alias: "example"
+    path_overrides:
+      - path: "/special"
+        css_locator: ".special-content"
+        exclude_selectors:
+          - ".sidebar"
+output_type: "single"
+requests_per_second: 1.0
+burst_limit: 5
 `)
 
 	tmpfile, err := os.CreateTemp("", "config*.yml")
@@ -59,33 +58,32 @@ func TestLoad(t *testing.T) {
 	}
 
 	// Check if the loaded config matches the expected values
+	rps := 1.0
+	bl := 5
 	expectedConfig := &Config{
-		FileTypes:     []string{"go", "md"},
-		Ignore:        []string{"*.tmp", "**/*.log"},
-		CodeGenerated: []string{"generated_*.go"},
-		Scrape: ScrapeConfig{
-			Sites: []SiteConfig{
-				{
-					BaseURL:          "https://example.com",
-					CSSLocator:       "main",
-					ExcludeSelectors: []string{".ads"},
-					MaxDepth:         2,
-					AllowedPaths:     []string{"/blog"},
-					ExcludePaths:     []string{"/admin"},
-					OutputAlias:      "example",
-					PathOverrides: []PathOverride{
-						{
-							Path:             "/special",
-							CSSLocator:       ".special-content",
-							ExcludeSelectors: []string{".sidebar"},
-						},
-					},
-				},
-			},
-			OutputType:        "single",
-			RequestsPerSecond: 1.0,
-			BurstLimit:        5,
-		},
+		FileExtensions:     []string{"go", "md"},
+		IgnorePaths:        []string{"*.tmp", "**/*.log"},
+		CodeGeneratedPaths: []string{"generated_*.go"},
+		Sites: []SiteConfig{
+			{
+				BaseURL:          "https://example.com",
+				CSSLocator:       "main",
+				ExcludeSelectors: []string{".ads"},
+				AllowedPaths:     []string{"/blog"},
+				ExcludePaths:     []string{"/admin"},
+				OutputAlias:      "example",
+				PathOverrides: []PathOverride{
+					{
+						Path:             "/special",
+						CSSLocator:       ".special-content",
+						ExcludeSelectors: []string{".sidebar"},
+					},
+				},
+			},
+		},
+		OutputType:        "single",
+		RequestsPerSecond: &rps,
+		BurstLimit:        &bl,
 	}
 
 	if !reflect.DeepEqual(config, expectedConfig) {
@@ -93,28 +92,59 @@ func TestLoad(t *testing.T) {
 	}
 }
 
-func TestDefaultConfigPath(t *testing.T) {
-	expected := "rollup.yml"
-	result := DefaultConfigPath()
-	if result != expected {
-		t.Errorf("DefaultConfigPath() = %q, want %q", result, expected)
-	}
-}
-
-func TestFileExists(t *testing.T) {
-	// Test with an existing file
-	tmpfile, err := os.CreateTemp("", "testfile")
-	if err != nil {
-		t.Fatalf("Failed to create temp file: %v", err)
-	}
-	defer os.Remove(tmpfile.Name())
-
-	if !FileExists(tmpfile.Name()) {
-		t.Errorf("FileExists(%q) = false, want true", tmpfile.Name())
-	}
-
-	// Test with a non-existing file
-	if FileExists("non_existing_file.txt") {
-		t.Errorf("FileExists(\"non_existing_file.txt\") = true, want false")
+func TestValidate(t *testing.T) {
+	tests := []struct {
+		name    string
+		config  Config
+		wantErr bool
+	}{
+		{
+			name: "Valid config",
+			config: Config{
+				FileExtensions: []string{"go"},
+				Sites: []SiteConfig{
+					{BaseURL: "https://example.com"},
+				},
+			},
+			wantErr: false,
+		},
+		{
+			name:    "Empty config",
+			config:  Config{},
+			wantErr: false,
+		},
+		{
+			name: "Invalid requests per second",
+			config: Config{
+				FileExtensions:    []string{"go"},
+				RequestsPerSecond: func() *float64 { f := -1.0; return &f }(),
+			},
+			wantErr: true,
+		},
+		{
+			name: "Invalid burst limit",
+			config: Config{
+				FileExtensions: []string{"go"},
+				BurstLimit:     func() *int { i := -1; return &i }(),
+			},
+			wantErr: true,
+		},
+		{
+			name: "Site without base URL",
+			config: Config{
+				FileExtensions: []string{"go"},
+				Sites:          []SiteConfig{{}},
+			},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := tt.config.Validate()
+			if (err != nil) != tt.wantErr {
+				t.Errorf("Validate() error = %v, wantErr %v", err, tt.wantErr)
+			}
+		})
 	}
 }
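Both test files can be exercised without any project-specific tooling; standard `go test` selectors are enough:

```bash
go test ./internal/config -run 'TestLoad|TestValidate' -v
```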
diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go
index 1e145e5..3994df3 100644
--- a/internal/scraper/scraper.go
+++ b/internal/scraper/scraper.go
@@ -45,7 +45,6 @@ type SiteConfig struct {
 	BaseURL          string
 	CSSLocator       string
 	ExcludeSelectors []string
-	MaxDepth         int
 	AllowedPaths     []string
 	ExcludePaths     []string
 	OutputAlias      string
@@ -156,57 +155,6 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
 	}{url, content, nil}
 }
 
-func scrapeSite(site SiteConfig, results chan<- struct {
-	url     string
-	content string
-	err     error
-}, limiter *rate.Limiter,
-) {
-	visited := make(map[string]bool)
-	queue := []string{site.BaseURL}
-
-	for len(queue) > 0 {
-		url := queue[0]
-		queue = queue[1:]
-
-		if visited[url] {
-			continue
-		}
-		visited[url] = true
-
-		if !isAllowedURL(url, site) {
-			continue
-		}
-
-		// Wait for rate limiter before making the request
-		err := limiter.Wait(context.Background())
-		if err != nil {
-			results <- struct {
-				url     string
-				content string
-				err     error
-			}{url, "", fmt.Errorf("rate limiter error: %v", err)}
-			continue
-		}
-
-		cssLocator, excludeSelectors := getOverrides(url, site)
-		content, err := scrapeURL(url, cssLocator, excludeSelectors)
-		results <- struct {
-			url     string
-			content string
-			err     error
-		}{url, content, err}
-
-		if len(visited) < site.MaxDepth {
-			links, _ := ExtractLinks(url)
-			for _, link := range links {
-				if !visited[link] && isAllowedURL(link, site) {
-					queue = append(queue, link)
-				}
-			}
-		}
-	}
-}
 
 func isAllowedURL(urlStr string, site SiteConfig) bool {
 	parsedURL, err := url.Parse(urlStr)
@@ -510,40 +458,6 @@ func scrollPage(page playwright.Page) error {
 	return nil
 }
 
-// ExtractLinks extracts all links from the given URL
-func ExtractLinks(urlStr string) ([]string, error) {
-	logger.Printf("Extracting links from URL: %s\n", urlStr)
-
-	page, err := browser.NewPage()
-	if err != nil {
-		return nil, fmt.Errorf("could not create page: %v", err)
-	}
-	defer page.Close()
-
-	if _, err = page.Goto(urlStr, playwright.PageGotoOptions{
-		WaitUntil: playwright.WaitUntilStateNetworkidle,
-	}); err != nil {
-		return nil, fmt.Errorf("could not go to page: %v", err)
-	}
-
-	links, err := page.Evaluate(`() => {
-		const anchors = document.querySelectorAll('a');
-		return Array.from(anchors).map(a => a.href);
-	}`)
-	if err != nil {
-		return nil, fmt.Errorf("could not extract links: %v", err)
-	}
-
-	var result []string
-	for _, link := range links.([]interface{}) {
-		// Normalize URL by removing trailing slash
-		normalizedLink := strings.TrimRight(link.(string), "/")
-		result = append(result, normalizedLink)
-	}
-
-	logger.Printf("Extracted %d links\n", len(result))
-	return result, nil
-}
 
 // ExtractContentWithCSS extracts content from HTML using a CSS selector
 func ExtractContentWithCSS(content, includeSelector string, excludeSelectors []string) (string, error) {
diff --git a/main.go b/main.go
index 0f7d9c7..e8ad8a0 100644
--- a/main.go
+++ b/main.go
@@ -20,7 +20,7 @@ func main() {
 	var err error
 
 	if !isHelpCommand {
-		configPath := config.DefaultConfigPath()
+		configPath := "rollup.yml"
 		cfg, err = config.Load(configPath)
 		if err != nil {
 			log.Printf("Warning: Failed to load configuration: %v", err)