From 09608cf07307abfd1b49acb21e6a9075eca570cc Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 27 Nov 2025 15:56:37 +0000 Subject: [PATCH 1/2] fix: resolve 5 bugs identified in code review - Fix malformed YAML in config_test.go (incorrect indentation) - Add validation for empty file_extensions in Config.Validate() - Remove obsolete max_depth test case (field no longer exists) - Remove unused global cfg variable in main.go - Fix race condition in ScrapeSites by counting URLs before goroutines - Remove unreachable JavaScript code in scroll script, add proper delay - Standardize file extensions to not include leading dot --- cmd/files.go | 2 +- internal/config/config.go | 4 ++++ internal/config/config_test.go | 24 ++++++++---------------- internal/scraper/scraper.go | 10 +++++----- main.go | 2 -- 5 files changed, 18 insertions(+), 24 deletions(-) diff --git a/cmd/files.go b/cmd/files.go index 568b2fb..ba1d63f 100644 --- a/cmd/files.go +++ b/cmd/files.go @@ -33,7 +33,7 @@ whose name is -rollup-.md.`, func init() { filesCmd.Flags().StringVarP(&path, "path", "p", ".", "Path to the project directory") - filesCmd.Flags().StringVarP(&fileTypes, "types", "t", ".go,.md,.txt", "Comma-separated list of file extensions to include") + filesCmd.Flags().StringVarP(&fileTypes, "types", "t", "go,md,txt", "Comma-separated list of file extensions to include (without leading dot)") filesCmd.Flags().StringVarP(&codeGenPatterns, "codegen", "g", "", "Comma-separated list of glob patterns for code-generated files") filesCmd.Flags().StringVarP(&ignorePatterns, "ignore", "i", "", "Comma-separated list of glob patterns for files to ignore") } diff --git a/internal/config/config.go b/internal/config/config.go index e6a3ba0..f3e67a6 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -88,6 +88,10 @@ func Load(configPath string) (*Config, error) { // Validate checks the configuration for any invalid values func (c *Config) Validate() error { + if len(c.FileExtensions) == 0 && len(c.Sites) == 0 { + return fmt.Errorf("file_extensions or sites must be specified") + } + if c.RequestsPerSecond != nil && *c.RequestsPerSecond <= 0 { return fmt.Errorf("requests_per_second must be positive") } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 400fcc3..77a36fd 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -10,8 +10,8 @@ func TestLoad(t *testing.T) { // Create a temporary config file content := []byte(` file_extensions: - - .go - - .md + - go + - md ignore_paths: - "*.tmp" - "**/*.log" @@ -27,7 +27,7 @@ sites: - "/blog" exclude_paths: - "/admin" - file_name_prefix: "example" + file_name_prefix: "example" path_overrides: - path: "/special" css_locator: ".special-content" @@ -61,7 +61,7 @@ burst_limit: 5 rps := 1.0 bl := 5 expectedConfig := &Config{ - FileExtensions: []string{".go", ".md"}, + FileExtensions: []string{"go", "md"}, IgnorePaths: []string{"*.tmp", "**/*.log"}, CodeGeneratedPaths: []string{"generated_*.go"}, Sites: []SiteConfig{ @@ -100,7 +100,7 @@ func TestValidate(t *testing.T) { { name: "Valid config", config: Config{ - FileExtensions: []string{".go"}, + FileExtensions: []string{"go"}, Sites: []SiteConfig{ {BaseURL: "https://example.com"}, }, @@ -115,7 +115,7 @@ func TestValidate(t *testing.T) { { name: "Invalid requests per second", config: Config{ - FileExtensions: []string{".go"}, + FileExtensions: []string{"go"}, RequestsPerSecond: func() *float64 { f := -1.0; return &f }(), }, wantErr: true, @@ -123,7 +123,7 @@ func TestValidate(t *testing.T) { { name: "Invalid burst limit", config: Config{ - FileExtensions: []string{".go"}, + FileExtensions: []string{"go"}, BurstLimit: func() *int { i := -1; return &i }(), }, wantErr: true, @@ -131,19 +131,11 @@ func TestValidate(t *testing.T) { { name: "Site without base URL", config: Config{ - FileExtensions: []string{".go"}, + FileExtensions: []string{"go"}, Sites: []SiteConfig{{}}, }, wantErr: true, }, - { - name: "Negative max depth", - config: Config{ - FileExtensions: []string{".go"}, - Sites: []SiteConfig{{BaseURL: "https://example.com"}}, - }, - wantErr: true, - }, } for _, tt := range tests { diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go index ec02650..2776d56 100644 --- a/internal/scraper/scraper.go +++ b/internal/scraper/scraper.go @@ -74,6 +74,9 @@ func ScrapeSites(config Config) error { var wg sync.WaitGroup totalURLs := 0 + for _, site := range config.Sites { + totalURLs += len(site.AllowedPaths) + } for _, site := range config.Sites { logger.Printf("Processing site: %s\n", site.BaseURL) wg.Add(1) @@ -81,7 +84,6 @@ func ScrapeSites(config Config) error { defer wg.Done() for _, path := range site.AllowedPaths { fullURL := site.BaseURL + path - totalURLs++ logger.Printf("Queueing URL for scraping: %s\n", fullURL) scrapeSingleURL(fullURL, site, results, limiter) } @@ -532,8 +534,6 @@ func scrollPage(page playwright.Page) error { () => { window.scrollTo(0, document.body.scrollHeight); return document.body.scrollHeight; - // wait for 500 ms - new Promise(resolve => setTimeout(resolve, 500)); } ` @@ -565,8 +565,8 @@ func scrollPage(page playwright.Page) error { previousHeight = currentHeight - // Wait for a while before scrolling again - + // Wait for content to load before scrolling again + time.Sleep(100 * time.Millisecond) } logger.Println("Scrolling back to top") diff --git a/main.go b/main.go index e8ad8a0..4127f4b 100644 --- a/main.go +++ b/main.go @@ -10,8 +10,6 @@ import ( "github.com/tnypxl/rollup/internal/scraper" ) -var cfg *config.Config - func main() { // Check if the command is "help" isHelpCommand := len(os.Args) > 1 && (os.Args[1] == "help" || os.Args[1] == "--help" || os.Args[1] == "-h") From ff1301240893d452a33c877be2bc35aceab37c21 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 27 Nov 2025 16:05:42 +0000 Subject: [PATCH 2/2] fix: address functionality gaps identified in code review - Wire up --config/-f flag to actually load custom config files - Move config loading to PersistentPreRunE in root.go - Simplify main.go to just call cmd.Execute() - Move Playwright init to web command's PreRunE/PostRunE - Remove unused functions from cmd/web.go (~90 lines of dead code) - Remove writeSingleFile, writeMultipleFiles, generateDefaultFilename - Remove scrapeURL, extractAndConvertContent, testExtractAndConvertContent - Remove unused mock function from web_test.go - Add OutputType validation to Config.Validate() - Only allow "single", "separate", or empty string - Add test cases for valid and invalid output types --- cmd/root.go | 32 ++++++++-- cmd/web.go | 103 +++++---------------------------- cmd/web_test.go | 5 -- internal/config/config.go | 4 ++ internal/config/config_test.go | 24 ++++++++ main.go | 29 +--------- 6 files changed, 68 insertions(+), 129 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index 43f2890..500d0b3 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -1,8 +1,10 @@ package cmd import ( + "log" + "github.com/spf13/cobra" - config "github.com/tnypxl/rollup/internal/config" + "github.com/tnypxl/rollup/internal/config" ) var ( @@ -15,13 +17,31 @@ var rootCmd = &cobra.Command{ Short: "Rollup is a tool for combining and processing files", Long: `Rollup is a versatile tool that can combine and process files in various ways. Use subcommands to perform specific operations.`, + PersistentPreRunE: func(cmd *cobra.Command, args []string) error { + // Skip config loading for generate and help commands + if cmd.Name() == "generate" || cmd.Name() == "help" { + return nil + } + + // Determine config path + configPath := configFile + if configPath == "" { + configPath = "rollup.yml" + } + + // Load configuration + var err error + cfg, err = config.Load(configPath) + if err != nil { + log.Printf("Warning: Failed to load configuration from %s: %v", configPath, err) + cfg = &config.Config{} // Use empty config if loading fails + } + + return nil + }, } -func Execute(conf *config.Config) error { - if conf == nil { - conf = &config.Config{} // Use an empty config if none is provided - } - cfg = conf // Set the cfg variable in cmd/files.go +func Execute() error { return rootCmd.Execute() } diff --git a/cmd/web.go b/cmd/web.go index 81b2ebe..7b67a96 100644 --- a/cmd/web.go +++ b/cmd/web.go @@ -28,7 +28,19 @@ var webCmd = &cobra.Command{ Use: "web", Short: "Scrape main content from webpages and convert to Markdown", Long: `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`, - RunE: runWeb, + PreRunE: func(cmd *cobra.Command, args []string) error { + // Initialize Playwright for web scraping + if err := scraper.InitPlaywright(); err != nil { + return fmt.Errorf("failed to initialize Playwright: %w", err) + } + return nil + }, + RunE: runWeb, + PostRunE: func(cmd *cobra.Command, args []string) error { + // Clean up Playwright resources + scraper.ClosePlaywright() + return nil + }, } func init() { @@ -144,95 +156,6 @@ func runWeb(cmd *cobra.Command, args []string) error { return nil } -func writeSingleFile(content map[string]string) error { - outputFile := generateDefaultFilename() - file, err := os.Create(outputFile) - if err != nil { - return fmt.Errorf("error creating output file: %v", err) - } - defer file.Close() - - for url, c := range content { - _, err = fmt.Fprintf(file, "# ::: Content from %s\n\n%s\n\n---\n\n", url, c) - if err != nil { - return fmt.Errorf("error writing content to file: %v", err) - } - } - - fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(content), outputFile) - return nil -} - -func writeMultipleFiles(content map[string]string) error { - for url, c := range content { - filename, err := getFilenameFromContent(c, url) - if err != nil { - return fmt.Errorf("error generating filename for %s: %v", url, err) - } - - file, err := os.Create(filename) - if err != nil { - return fmt.Errorf("error creating output file %s: %v", filename, err) - } - - _, err = file.WriteString(fmt.Sprintf("# ::: Content from %s\n\n%s\n", url, c)) - if err != nil { - file.Close() - return fmt.Errorf("error writing content to file %s: %v", filename, err) - } - - file.Close() - fmt.Printf("Content from %s has been saved to %s\n", url, filename) - } - - return nil -} - -func generateDefaultFilename() string { - timestamp := time.Now().Format("20060102-150405") - return fmt.Sprintf("web-%s.rollup.md", timestamp) -} - -func scrapeURL(urlStr string) (string, error) { - content, err := testExtractAndConvertContent(urlStr) - if err != nil { - return "", err - } - - return content, nil -} - -var ( - testExtractAndConvertContent = extractAndConvertContent -) - -func extractAndConvertContent(urlStr string) (string, error) { - content, err := scraper.FetchWebpageContent(urlStr) - if err != nil { - return "", fmt.Errorf("error fetching webpage content: %v", err) - } - - if includeSelector != "" { - content, err = scraper.ExtractContentWithCSS(content, includeSelector, excludeSelectors) - if err != nil { - return "", fmt.Errorf("error extracting content with CSS: %v", err) - } - } - - markdown, err := scraper.ProcessHTMLContent(content, scraper.Config{}) - if err != nil { - return "", fmt.Errorf("error processing HTML content: %v", err) - } - - parsedURL, err := url.Parse(urlStr) - if err != nil { - return "", fmt.Errorf("error parsing URL: %v", err) - } - header := fmt.Sprintf("# ::: Content from %s\n\n", parsedURL.String()) - - return header + markdown + "\n\n", nil -} - func getFilenameFromContent(content, urlStr string) (string, error) { // Try to extract title from content titleStart := strings.Index(content, "") diff --git a/cmd/web_test.go b/cmd/web_test.go index 79972a9..e18e0bc 100644 --- a/cmd/web_test.go +++ b/cmd/web_test.go @@ -97,8 +97,3 @@ func TestGetFilenameFromContent(t *testing.T) { } } } - -// Mock functions for testing -func mockExtractAndConvertContent(urlStr string) (string, error) { - return "Mocked content for " + urlStr, nil -} diff --git a/internal/config/config.go b/internal/config/config.go index f3e67a6..cfac4d4 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -92,6 +92,10 @@ func (c *Config) Validate() error { return fmt.Errorf("file_extensions or sites must be specified") } + if c.OutputType != "" && c.OutputType != "single" && c.OutputType != "separate" { + return fmt.Errorf("output_type must be 'single' or 'separate'") + } + if c.RequestsPerSecond != nil && *c.RequestsPerSecond <= 0 { return fmt.Errorf("requests_per_second must be positive") } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 77a36fd..cad82e1 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -136,6 +136,30 @@ func TestValidate(t *testing.T) { }, wantErr: true, }, + { + name: "Valid output type single", + config: Config{ + FileExtensions: []string{"go"}, + OutputType: "single", + }, + wantErr: false, + }, + { + name: "Valid output type separate", + config: Config{ + FileExtensions: []string{"go"}, + OutputType: "separate", + }, + wantErr: false, + }, + { + name: "Invalid output type", + config: Config{ + FileExtensions: []string{"go"}, + OutputType: "invalid", + }, + wantErr: true, + }, } for _, tt := range tests { diff --git a/main.go b/main.go index 4127f4b..3519178 100644 --- a/main.go +++ b/main.go @@ -2,40 +2,13 @@ package main import ( "fmt" - "log" "os" "github.com/tnypxl/rollup/cmd" - "github.com/tnypxl/rollup/internal/config" - "github.com/tnypxl/rollup/internal/scraper" ) func main() { - // Check if the command is "help" - isHelpCommand := len(os.Args) > 1 && (os.Args[1] == "help" || os.Args[1] == "--help" || os.Args[1] == "-h") - - var cfg *config.Config - var err error - - if !isHelpCommand { - configPath := "rollup.yml" - cfg, err = config.Load(configPath) - if err != nil { - log.Printf("Warning: Failed to load configuration: %v", err) - // Continue execution without a config file - } - - // Initialize the scraper logger with default verbosity (false) - scraper.SetupLogger(false) - - err = scraper.InitPlaywright() - if err != nil { - log.Fatalf("Failed to initialize Playwright: %v", err) - } - defer scraper.ClosePlaywright() - } - - if err := cmd.Execute(cfg); err != nil { + if err := cmd.Execute(); err != nil { fmt.Println(err) os.Exit(1) }