diff --git a/cmd/root.go b/cmd/root.go index 43f2890..500d0b3 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -1,8 +1,10 @@ package cmd import ( + "log" + "github.com/spf13/cobra" - config "github.com/tnypxl/rollup/internal/config" + "github.com/tnypxl/rollup/internal/config" ) var ( @@ -15,13 +17,31 @@ var rootCmd = &cobra.Command{ Short: "Rollup is a tool for combining and processing files", Long: `Rollup is a versatile tool that can combine and process files in various ways. Use subcommands to perform specific operations.`, + PersistentPreRunE: func(cmd *cobra.Command, args []string) error { + // Skip config loading for generate and help commands + if cmd.Name() == "generate" || cmd.Name() == "help" { + return nil + } + + // Determine config path + configPath := configFile + if configPath == "" { + configPath = "rollup.yml" + } + + // Load configuration + var err error + cfg, err = config.Load(configPath) + if err != nil { + log.Printf("Warning: Failed to load configuration from %s: %v", configPath, err) + cfg = &config.Config{} // Use empty config if loading fails + } + + return nil + }, } -func Execute(conf *config.Config) error { - if conf == nil { - conf = &config.Config{} // Use an empty config if none is provided - } - cfg = conf // Set the cfg variable in cmd/files.go +func Execute() error { return rootCmd.Execute() } diff --git a/cmd/web.go b/cmd/web.go index 81b2ebe..7b67a96 100644 --- a/cmd/web.go +++ b/cmd/web.go @@ -28,7 +28,19 @@ var webCmd = &cobra.Command{ Use: "web", Short: "Scrape main content from webpages and convert to Markdown", Long: `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`, - RunE: runWeb, + PreRunE: func(cmd *cobra.Command, args []string) error { + // Initialize Playwright for web scraping + if err := scraper.InitPlaywright(); err != nil { + return fmt.Errorf("failed to initialize Playwright: %w", err) + } + return nil + }, + RunE: runWeb, + PostRunE: func(cmd *cobra.Command, args []string) error { + // Clean up Playwright resources + scraper.ClosePlaywright() + return nil + }, } func init() { @@ -144,95 +156,6 @@ func runWeb(cmd *cobra.Command, args []string) error { return nil } -func writeSingleFile(content map[string]string) error { - outputFile := generateDefaultFilename() - file, err := os.Create(outputFile) - if err != nil { - return fmt.Errorf("error creating output file: %v", err) - } - defer file.Close() - - for url, c := range content { - _, err = fmt.Fprintf(file, "# ::: Content from %s\n\n%s\n\n---\n\n", url, c) - if err != nil { - return fmt.Errorf("error writing content to file: %v", err) - } - } - - fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(content), outputFile) - return nil -} - -func writeMultipleFiles(content map[string]string) error { - for url, c := range content { - filename, err := getFilenameFromContent(c, url) - if err != nil { - return fmt.Errorf("error generating filename for %s: %v", url, err) - } - - file, err := os.Create(filename) - if err != nil { - return fmt.Errorf("error creating output file %s: %v", filename, err) - } - - _, err = file.WriteString(fmt.Sprintf("# ::: Content from %s\n\n%s\n", url, c)) - if err != nil { - file.Close() - return fmt.Errorf("error writing content to file %s: %v", filename, err) - } - - file.Close() - fmt.Printf("Content from %s has been saved to %s\n", url, filename) - } - - return nil -} - -func generateDefaultFilename() string { - timestamp := time.Now().Format("20060102-150405") - return fmt.Sprintf("web-%s.rollup.md", timestamp) -} - -func scrapeURL(urlStr string) (string, error) { - content, err := testExtractAndConvertContent(urlStr) - if err != nil { - return "", err - } - - return content, nil -} - -var ( - testExtractAndConvertContent = extractAndConvertContent -) - -func extractAndConvertContent(urlStr string) (string, error) { - content, err := scraper.FetchWebpageContent(urlStr) - if err != nil { - return "", fmt.Errorf("error fetching webpage content: %v", err) - } - - if includeSelector != "" { - content, err = scraper.ExtractContentWithCSS(content, includeSelector, excludeSelectors) - if err != nil { - return "", fmt.Errorf("error extracting content with CSS: %v", err) - } - } - - markdown, err := scraper.ProcessHTMLContent(content, scraper.Config{}) - if err != nil { - return "", fmt.Errorf("error processing HTML content: %v", err) - } - - parsedURL, err := url.Parse(urlStr) - if err != nil { - return "", fmt.Errorf("error parsing URL: %v", err) - } - header := fmt.Sprintf("# ::: Content from %s\n\n", parsedURL.String()) - - return header + markdown + "\n\n", nil -} - func getFilenameFromContent(content, urlStr string) (string, error) { // Try to extract title from content titleStart := strings.Index(content, "") diff --git a/cmd/web_test.go b/cmd/web_test.go index 79972a9..e18e0bc 100644 --- a/cmd/web_test.go +++ b/cmd/web_test.go @@ -97,8 +97,3 @@ func TestGetFilenameFromContent(t *testing.T) { } } } - -// Mock functions for testing -func mockExtractAndConvertContent(urlStr string) (string, error) { - return "Mocked content for " + urlStr, nil -} diff --git a/internal/config/config.go b/internal/config/config.go index f3e67a6..cfac4d4 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -92,6 +92,10 @@ func (c *Config) Validate() error { return fmt.Errorf("file_extensions or sites must be specified") } + if c.OutputType != "" && c.OutputType != "single" && c.OutputType != "separate" { + return fmt.Errorf("output_type must be 'single' or 'separate'") + } + if c.RequestsPerSecond != nil && *c.RequestsPerSecond <= 0 { return fmt.Errorf("requests_per_second must be positive") } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 77a36fd..cad82e1 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -136,6 +136,30 @@ func TestValidate(t *testing.T) { }, wantErr: true, }, + { + name: "Valid output type single", + config: Config{ + FileExtensions: []string{"go"}, + OutputType: "single", + }, + wantErr: false, + }, + { + name: "Valid output type separate", + config: Config{ + FileExtensions: []string{"go"}, + OutputType: "separate", + }, + wantErr: false, + }, + { + name: "Invalid output type", + config: Config{ + FileExtensions: []string{"go"}, + OutputType: "invalid", + }, + wantErr: true, + }, } for _, tt := range tests { diff --git a/main.go b/main.go index 4127f4b..3519178 100644 --- a/main.go +++ b/main.go @@ -2,40 +2,13 @@ package main import ( "fmt" - "log" "os" "github.com/tnypxl/rollup/cmd" - "github.com/tnypxl/rollup/internal/config" - "github.com/tnypxl/rollup/internal/scraper" ) func main() { - // Check if the command is "help" - isHelpCommand := len(os.Args) > 1 && (os.Args[1] == "help" || os.Args[1] == "--help" || os.Args[1] == "-h") - - var cfg *config.Config - var err error - - if !isHelpCommand { - configPath := "rollup.yml" - cfg, err = config.Load(configPath) - if err != nil { - log.Printf("Warning: Failed to load configuration: %v", err) - // Continue execution without a config file - } - - // Initialize the scraper logger with default verbosity (false) - scraper.SetupLogger(false) - - err = scraper.InitPlaywright() - if err != nil { - log.Fatalf("Failed to initialize Playwright: %v", err) - } - defer scraper.ClosePlaywright() - } - - if err := cmd.Execute(cfg); err != nil { + if err := cmd.Execute(); err != nil { fmt.Println(err) os.Exit(1) }