diff --git a/cmd/web.go b/cmd/web.go
index ecc24d4..e986305 100644
--- a/cmd/web.go
+++ b/cmd/web.go
@@ -4,9 +4,7 @@ import (
 	"fmt"
 	"io"
 	"log"
-	"net/url"
 	"os"
-	"regexp"
 	"strings"
 	"time"
 
@@ -23,8 +21,6 @@ var (
 	excludeSelectors []string
 )
 
-var scraperConfig scraper.Config
-
 var webCmd = &cobra.Command{
 	Use:   "web",
 	Short: "Scrape main content from webpages and convert to Markdown",
@@ -41,93 +37,83 @@ func init() {
 }
 
 func runWeb(cmd *cobra.Command, args []string) error {
-	scraper.SetupLogger(verbose)
-	logger := log.New(os.Stdout, "WEB: ", log.LstdFlags)
-	if !verbose {
-		logger.SetOutput(io.Discard)
-	}
-	logger.Printf("Starting web scraping process with verbose mode: %v", verbose)
-	scraperConfig.Verbose = verbose
+	scraper.SetupLogger(verbose)
+	logger := log.New(os.Stdout, "WEB: ", log.LstdFlags)
+	if !verbose {
+		logger.SetOutput(io.Discard)
+	}
+	logger.Printf("Starting web scraping process with verbose mode: %v", verbose)
 
-	var siteConfigs []scraper.SiteConfig
-	if len(cfg.Scrape.Sites) > 0 {
-		logger.Printf("Using configuration from rollup.yml for %d sites", len(cfg.Scrape.Sites))
-		siteConfigs = make([]scraper.SiteConfig, len(cfg.Scrape.Sites))
-		for i, site := range cfg.Scrape.Sites {
-			siteConfigs[i] = scraper.SiteConfig{
-				BaseURL:          site.BaseURL,
-				CSSLocator:       site.CSSLocator,
-				ExcludeSelectors: site.ExcludeSelectors,
-				MaxDepth:         site.MaxDepth,
-				AllowedPaths:     site.AllowedPaths,
-				ExcludePaths:     site.ExcludePaths,
-				OutputAlias:      site.OutputAlias,
-				PathOverrides:    convertPathOverrides(site.PathOverrides),
-			}
-			logger.Printf("Site %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d, AllowedPaths=%v",
-				i+1, site.BaseURL, site.CSSLocator, site.MaxDepth, site.AllowedPaths)
-		}
-	} else {
-		logger.Printf("No sites defined in rollup.yml, falling back to URL-based configuration")
-		siteConfigs = make([]scraper.SiteConfig, len(urls))
-		for i, u := range urls {
-			siteConfigs[i] = scraper.SiteConfig{
-				BaseURL:          u,
-				CSSLocator:       includeSelector,
-				ExcludeSelectors: excludeSelectors,
-				MaxDepth:         depth,
-			}
-			logger.Printf("URL %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d",
-				i+1, u, includeSelector, depth)
-		}
-	}
+	// Prepare site configurations
+	var siteConfigs []scraper.SiteConfig
+	if len(cfg.Scrape.Sites) > 0 {
+		// Use configurations from rollup.yml
+		logger.Printf("Using configuration from rollup.yml for %d sites", len(cfg.Scrape.Sites))
+		siteConfigs = make([]scraper.SiteConfig, len(cfg.Scrape.Sites))
+		for i, site := range cfg.Scrape.Sites {
+			siteConfigs[i] = scraper.SiteConfig{
+				BaseURL:          site.BaseURL,
+				CSSLocator:       site.CSSLocator,
+				ExcludeSelectors: site.ExcludeSelectors,
+				MaxDepth:         site.MaxDepth,
+				AllowedPaths:     site.AllowedPaths,
+				ExcludePaths:     site.ExcludePaths,
+				OutputAlias:      site.OutputAlias,
+				PathOverrides:    convertPathOverrides(site.PathOverrides),
+			}
+			logger.Printf("Site %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d, AllowedPaths=%v",
+				i+1, site.BaseURL, site.CSSLocator, site.MaxDepth, site.AllowedPaths)
+		}
+	} else {
+		// Use command-line URLs
+		if len(urls) == 0 {
+			logger.Println("Error: No URLs provided via --urls flag")
+			return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.sites' in the rollup.yml file")
+		}
+		siteConfigs = make([]scraper.SiteConfig, len(urls))
+		for i, u := range urls {
+			siteConfigs[i] = scraper.SiteConfig{
+				BaseURL:          u,
+				CSSLocator:       includeSelector,
+				ExcludeSelectors: excludeSelectors,
+				MaxDepth:         depth,
+				AllowedPaths:     []string{"/"}, // Allow all paths by default
+			}
+			logger.Printf("URL %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d",
+				i+1, u, includeSelector, depth)
+		}
+	}
 
-	if len(siteConfigs) == 0 {
-		logger.Println("Error: No sites or URLs provided")
-		return fmt.Errorf("no sites or URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.sites' in the rollup.yml file")
-	}
+	// Set up scraper configuration
+	scraperConfig := scraper.Config{
+		Sites:      siteConfigs,
+		OutputType: outputType,
+		Verbose:    verbose,
+		Scrape: scraper.ScrapeConfig{
+			RequestsPerSecond: cfg.Scrape.RequestsPerSecond,
+			BurstLimit:        cfg.Scrape.BurstLimit,
+		},
+	}
+	logger.Printf("Scraper configuration: OutputType=%s, RequestsPerSecond=%f, BurstLimit=%d",
+		outputType, scraperConfig.Scrape.RequestsPerSecond, scraperConfig.Scrape.BurstLimit)
 
-	// Set default values for rate limiting
-	defaultRequestsPerSecond := 1.0
-	defaultBurstLimit := 3
+	// Start scraping using scraper.ScrapeSites
+	logger.Println("Starting scraping process")
+	scrapedContent, err := scraper.ScrapeSites(scraperConfig)
+	if err != nil {
+		logger.Printf("Error occurred during scraping: %v", err)
+		return fmt.Errorf("error scraping content: %v", err)
+	}
+	logger.Printf("Scraping completed. Total content scraped: %d", len(scrapedContent))
 
-	// Use default values if not set in the configuration
-	requestsPerSecond := cfg.Scrape.RequestsPerSecond
-	if requestsPerSecond == 0 {
-		requestsPerSecond = defaultRequestsPerSecond
-	}
-	burstLimit := cfg.Scrape.BurstLimit
-	if burstLimit == 0 {
-		burstLimit = defaultBurstLimit
-	}
-
-	scraperConfig := scraper.Config{
-		Sites:      siteConfigs,
-		OutputType: outputType,
-		Verbose:    verbose,
-		Scrape: scraper.ScrapeConfig{
-			RequestsPerSecond: requestsPerSecond,
-			BurstLimit:        burstLimit,
-		},
-	}
-	logger.Printf("Scraper configuration: OutputType=%s, RequestsPerSecond=%f, BurstLimit=%d",
-		outputType, requestsPerSecond, burstLimit)
-
-	logger.Println("Starting scraping process")
-	scrapedContent, err := scraper.ScrapeSites(scraperConfig)
-	if err != nil {
-		logger.Printf("Error occurred during scraping: %v", err)
-		return fmt.Errorf("error scraping content: %v", err)
-	}
-	logger.Printf("Scraping completed. Total content scraped: %d", len(scrapedContent))
-
-	if outputType == "single" {
-		logger.Println("Writing content to a single file")
-		return writeSingleFile(scrapedContent)
-	} else {
-		logger.Println("Writing content to multiple files")
-		return writeMultipleFiles(scrapedContent)
-	}
+	// Write output to files
+	if outputType == "single" {
+		logger.Println("Writing content to a single file")
+		return writeSingleFile(scrapedContent)
+	} else {
+		logger.Println("Writing content to multiple files")
+		return writeMultipleFiles(scrapedContent)
+	}
 }
 
 func writeSingleFile(content map[string]string) error {
@@ -151,11 +137,7 @@ func writeSingleFile(content map[string]string) error {
 
 func writeMultipleFiles(content map[string]string) error {
 	for url, c := range content {
-		filename, err := getFilenameFromContent(c, url)
-		if err != nil {
-			return fmt.Errorf("error generating filename for %s: %v", url, err)
-		}
-
+		filename := sanitizeFilename(url) + ".rollup.md"
 		file, err := os.Create(filename)
 		if err != nil {
 			return fmt.Errorf("error creating output file %s: %v", filename, err)
@@ -179,106 +161,14 @@ func generateDefaultFilename() string {
 	return fmt.Sprintf("web-%s.rollup.md", timestamp)
 }
 
-func scrapeRecursively(urlStr string, depth int) (string, error) {
-	visited := make(map[string]bool)
-	return scrapeURL(urlStr, depth, visited)
-}
-
-func scrapeURL(urlStr string, depth int, visited map[string]bool) (string, error) {
-	if depth < 0 || visited[urlStr] {
-		return "", nil
-	}
-
-	visited[urlStr] = true
-
-	content, err := testExtractAndConvertContent(urlStr)
-	if err != nil {
-		return "", err
-	}
-
-	if depth > 0 {
-		links, err := testExtractLinks(urlStr)
-		if err != nil {
-			return content, fmt.Errorf("error extracting links: %v", err)
-		}
-
-		for _, link := range links {
-			subContent, err := scrapeURL(link, depth-1, visited)
-			if err != nil {
-				fmt.Printf("Warning: Error scraping %s: %v\n", link, err)
-				continue
-			}
-			content += "\n\n---\n\n" + subContent
-		}
-	}
-
-	return content, nil
-}
-
-var (
-	testExtractAndConvertContent = extractAndConvertContent
-	testExtractLinks             = scraper.ExtractLinks
-)
-
-func extractAndConvertContent(urlStr string) (string, error) {
-	content, err := scraper.FetchWebpageContent(urlStr)
-	if err != nil {
-		return "", fmt.Errorf("error fetching webpage content: %v", err)
-	}
-
-	if includeSelector != "" {
-		content, err = scraper.ExtractContentWithCSS(content, includeSelector, excludeSelectors)
-		if err != nil {
-			return "", fmt.Errorf("error extracting content with CSS: %v", err)
-		}
-	}
-
-	markdown, err := scraper.ProcessHTMLContent(content, scraper.Config{})
-	if err != nil {
-		return "", fmt.Errorf("error processing HTML content: %v", err)
-	}
-
-	parsedURL, err := url.Parse(urlStr)
-	if err != nil {
-		return "", fmt.Errorf("error parsing URL: %v", err)
-	}
-	header := fmt.Sprintf("# ::: Content from %s\n\n", parsedURL.String())
-
-	return header + markdown + "\n\n", nil
-}
-
-func getFilenameFromContent(content, urlStr string) (string, error) {
-	// Try to extract title from content
-	titleStart := strings.Index(content, "