diff --git a/cmd/web.go b/cmd/web.go
index f2400c7..5029ec2 100644
--- a/cmd/web.go
+++ b/cmd/web.go
@@ -12,9 +12,9 @@ import (
 )
 
 var (
-	urls             []string
-	outputFile       string
-	depth            int
+	urls             []string
+	outputType       string
+	depth            int
 	includeSelector  string
 	excludeSelectors []string
 )
@@ -31,7 +31,7 @@ var webCmd = &cobra.Command{
 func init() {
 	rootCmd.AddCommand(webCmd)
 	webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
-	webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-.md)")
+	webCmd.Flags().StringVarP(&outputType, "output", "o", "single", "Output type: 'single' for one file, 'separate' for multiple files")
 	webCmd.Flags().IntVarP(&depth, "depth", "d", 0, "Depth of link traversal (default: 0, only scrape the given URLs)")
 	webCmd.Flags().StringVar(&includeSelector, "css", "", "CSS selector to extract specific content")
 	webCmd.Flags().StringSliceVar(&excludeSelectors, "exclude", []string{}, "CSS selectors to exclude from the extracted content (comma-separated)")
@@ -39,44 +39,80 @@ func init() {
 
 func runWeb(cmd *cobra.Command, args []string) error {
 	// Use config if available, otherwise use command-line flags
-	if len(urls) == 0 && cfg.Scrape.URL != "" {
-		urls = []string{cfg.Scrape.URL}
+	var urlConfigs []scraper.URLConfig
+	if len(urls) == 0 && len(cfg.Scrape.URLs) > 0 {
+		urlConfigs = make([]scraper.URLConfig, len(cfg.Scrape.URLs))
+		for i, u := range cfg.Scrape.URLs {
+			urlConfigs[i] = scraper.URLConfig{
+				URL:         u.URL,
+				CSSLocator:  u.CSSLocator,
+				OutputAlias: u.OutputAlias,
+			}
+		}
+	} else {
+		urlConfigs = make([]scraper.URLConfig, len(urls))
+		for i, u := range urls {
+			urlConfigs[i] = scraper.URLConfig{URL: u, CSSLocator: includeSelector}
+		}
 	}
 
-	if len(urls) == 0 {
-		return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.url' in the rollup.yml file")
+	if len(urlConfigs) == 0 {
+		return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.urls' in the rollup.yml file")
+	}
 
-	if outputFile == "" {
-		outputFile = generateDefaultFilename(urls)
+	scraperConfig := scraper.Config{
+		URLs:       urlConfigs,
+		OutputType: outputType,
+		Verbose:    verbose,
 	}
 
+	scrapedContent, err := scraper.ScrapeMultipleURLs(scraperConfig)
+	if err != nil {
+		return fmt.Errorf("error scraping content: %v", err)
+	}
+
+	if outputType == "single" {
+		return writeSingleFile(scrapedContent)
+	} else {
+		return writeMultipleFiles(scrapedContent)
+	}
+}
+
+func writeSingleFile(content map[string]string) error {
+	outputFile := generateDefaultFilename(urls)
 	file, err := os.Create(outputFile)
 	if err != nil {
 		return fmt.Errorf("error creating output file: %v", err)
 	}
 	defer file.Close()
 
-	for i, u := range urls {
-		extractedContent, err := scrapeRecursively(u, depth)
-		if err != nil {
-			return fmt.Errorf("error scraping content from %s: %v", u, err)
-		}
-
-		if i > 0 {
-			_, err = file.WriteString("\n\n---\n\n")
-			if err != nil {
-				return fmt.Errorf("error writing separator to file: %v", err)
-			}
-		}
-
-		_, err = file.WriteString(extractedContent)
+	for url, c := range content {
+		_, err = file.WriteString(fmt.Sprintf("# Content from %s\n\n%s\n\n---\n\n", url, c))
 		if err != nil {
 			return fmt.Errorf("error writing content to file: %v", err)
 		}
 	}
 
-	fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(urls), outputFile)
+	fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(content), outputFile)
+	return nil
+}
+
+func writeMultipleFiles(content map[string]string) error {
+	for url, c := range content {
+		filename := scraper.GetFilenameFromContent(c, url)
+		file, err := os.Create(filename)
+		if err != nil {
+			return fmt.Errorf("error creating output file %s: %v", filename, err)
+		}
+
+		_, err = file.WriteString(fmt.Sprintf("# Content from %s\n\n%s", url, c))
+		file.Close()
+		if err != nil {
+			return fmt.Errorf("error writing content to file %s: %v", filename, err)
+		}
+
+		fmt.Printf("Content from %s has been saved to %s\n", url, filename)
+	}
 	return nil
 }
diff --git a/internal/config/config.go b/internal/config/config.go
index 7ecc49d..95fed75 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -15,8 +15,14 @@ type Config struct {
 }
 
 type ScrapeConfig struct {
-	URL        string `yaml:"url"`
-	CSSLocator string `yaml:"css_locator"`
+	URLs       []URLConfig `yaml:"urls"`
+	OutputType string      `yaml:"output_type"`
+}
+
+type URLConfig struct {
+	URL         string `yaml:"url"`
+	CSSLocator  string `yaml:"css_locator"`
+	OutputAlias string `yaml:"output_alias"`
 }
 
 func Load(configPath string) (*Config, error) {
diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go
index 5ed43fb..39e07bf 100644
--- a/internal/scraper/scraper.go
+++ b/internal/scraper/scraper.go
@@ -22,10 +22,85 @@ var (
 )
 
 // Config holds the scraper configuration
 type Config struct {
-	CSSLocator string
+	URLs       []URLConfig
+	OutputType string
 	Verbose    bool
 }
 
+// ScrapeMultipleURLs scrapes multiple URLs concurrently
+func ScrapeMultipleURLs(config Config) (map[string]string, error) {
+	results := make(chan struct {
+		url     string
+		content string
+		err     error
+	}, len(config.URLs))
+
+	for _, urlConfig := range config.URLs {
+		go func(cfg URLConfig) {
+			content, err := scrapeURL(cfg)
+			results <- struct {
+				url     string
+				content string
+				err     error
+			}{cfg.URL, content, err}
+		}(urlConfig)
+	}
+
+	scrapedContent := make(map[string]string)
+	for i := 0; i < len(config.URLs); i++ {
+		result := <-results
+		if result.err != nil {
+			logger.Printf("Error scraping %s: %v\n", result.url, result.err)
+			continue
+		}
+		scrapedContent[result.url] = result.content
+	}
+
+	return scrapedContent, nil
+}
+
+func scrapeURL(config URLConfig) (string, error) {
+	content, err := FetchWebpageContent(config.URL)
+	if err != nil {
+		return "", err
+	}
+
+	if config.CSSLocator != "" {
+		content, err = ExtractContentWithCSS(content, config.CSSLocator, nil)
+		if err != nil {
+			return "", err
+		}
+	}
+
+	return ProcessHTMLContent(content, Config{})
+}
+
+func GetFilenameFromContent(content, url string) string {
+	// Try to extract title from content
+	titleStart := strings.Index(content, "<title>")
+	titleEnd := strings.Index(content, "</title>")
+	if titleStart != -1 && titleEnd != -1 && titleEnd > titleStart {
+		title := content[titleStart+7 : titleEnd]
+		return sanitizeFilename(title) + ".md"
+	}
+
+	// If no title found, use the URL
+	return sanitizeFilename(url) + ".md"
+}
+
+func sanitizeFilename(name string) string {
+	// Remove any character that isn't alphanumeric, dash, or underscore
+	reg := regexp.MustCompile("[^a-zA-Z0-9-_]+")
+	return reg.ReplaceAllString(name, "_")
+}
+
+// URLConfig holds configuration for a single URL
+type URLConfig struct {
+	URL         string
+	CSSLocator  string
+	OutputAlias string
+}
+
 // SetupLogger initializes the logger based on the verbose flag
 func SetupLogger(verbose bool) {
 	if verbose {
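
For reviewers, a rollup.yml scrape section using the new schema would look roughly like this. This is a sketch: the keys mirror the yaml tags declared in internal/config/config.go, while the URLs, selectors, and aliases are made-up placeholders. Note that output_alias is mapped through to scraper.URLConfig but is not yet consumed by the code shown in this diff.

    # illustrative values only; key names come from the yaml tags above
    scrape:
      output_type: separate
      urls:
        - url: https://example.com/docs/intro
          css_locator: "article.main"
          output_alias: intro
        - url: https://example.com/docs/api
          css_locator: "article.main"
          output_alias: api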
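The flag changes translate to invocations like the following (assuming the binary is built as rollup, which the rollup.yml name suggests; the URLs are placeholders):

    # default: all pages rolled into one Markdown file
    rollup web --urls https://example.com/a,https://example.com/b --css "article"

    # one file per page, named from each page's <title> via GetFilenameFromContent
    rollup web --urls https://example.com/a,https://example.com/b --output separate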
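And a minimal sketch of driving the new scraper API directly from elsewhere in this module. The import path is illustrative since the real module path is not shown in the diff, and internal/scraper resolves only from within the same module; everything else uses the exported names introduced above.

    package main

    import (
        "fmt"
        "log"

        "example.com/rollup/internal/scraper" // illustrative module path
    )

    func main() {
        // initialize the package logger first (SetupLogger appears in this diff)
        scraper.SetupLogger(false)

        cfg := scraper.Config{
            URLs: []scraper.URLConfig{
                // placeholder URL and selector, not taken from the PR
                {URL: "https://example.com", CSSLocator: "article"},
            },
            OutputType: "single",
        }

        // per-URL failures are logged and skipped rather than returned
        content, err := scraper.ScrapeMultipleURLs(cfg)
        if err != nil {
            log.Fatal(err)
        }
        for url, md := range content {
            fmt.Printf("%s -> %d bytes of Markdown\n", url, len(md))
        }
    }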