Mirror of https://github.com/tnypxl/rollup.git (synced 2025-12-13 06:23:18 +00:00)
212 lines · 7.2 KiB · Go

package cmd

import (
	"fmt"
	"io"
	"log"
	"os"
	"strings"
	"time"

	"github.com/spf13/cobra"
	"github.com/tnypxl/rollup/internal/config"
	"github.com/tnypxl/rollup/internal/scraper"
)

var (
	urls             []string
	outputType       string
	depth            int
	includeSelector  string
	excludeSelectors []string
)

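// webCmd implements `rollup web`, which scrapes pages and emits Markdown.
// A typical invocation, assuming the binary is installed as `rollup` (the
// URL and selector below are illustrative, not taken from the repository):
//
//	rollup web --urls https://example.com/docs --css "article" --depth 1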
var webCmd = &cobra.Command{
	Use:   "web",
	Short: "Scrape main content from webpages and convert to Markdown",
	Long:  `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`,
	RunE:  runWeb,
}

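// init registers the subcommand's flags, binding them to the package-level
// variables above.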
func init() {
	webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
	webCmd.Flags().StringVarP(&outputType, "output", "o", "single", "Output type: 'single' for one file, 'separate' for multiple files")
	webCmd.Flags().IntVarP(&depth, "depth", "d", 0, "Depth of link traversal (default: 0, only scrape the given URLs)")
	webCmd.Flags().StringVar(&includeSelector, "css", "", "CSS selector to extract specific content")
	webCmd.Flags().StringSliceVar(&excludeSelectors, "exclude", []string{}, "CSS selectors to exclude from the extracted content (comma-separated)")
}

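// validateScrapeConfig rejects rate-limit settings that would disable the
// scraper: both requests_per_second and burst_limit must be positive.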
func validateScrapeConfig(scrapeConfig config.ScrapeConfig) error {
	if scrapeConfig.RequestsPerSecond <= 0 {
		return fmt.Errorf("requests_per_second must be greater than 0")
	}
	if scrapeConfig.BurstLimit <= 0 {
		return fmt.Errorf("burst_limit must be greater than 0")
	}
	return nil
}

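// runWeb drives the whole scrape: it builds one scraper.SiteConfig per site
// (from rollup.yml when scrape.sites is set, otherwise from the --urls flag),
// validates the rate limits, runs scraper.ScrapeSites, and writes the result
// as Markdown. A minimal rollup.yml sketch for the config-driven path might
// look like the following; requests_per_second, burst_limit, and scrape.sites
// appear in messages in this file, but the per-site key names are guesses
// from the struct fields, not verified against internal/config:
//
//	scrape:
//	  requests_per_second: 2
//	  burst_limit: 4
//	  sites:
//	    - base_url: https://example.com
//	      css_locator: "main"
//	      max_depth: 1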
func runWeb(cmd *cobra.Command, args []string) error {
	scraper.SetupLogger(verbose)
	logger := log.New(os.Stdout, "WEB: ", log.LstdFlags)
	if !verbose {
		logger.SetOutput(io.Discard)
	}
	logger.Printf("Starting web scraping process with verbose mode: %v", verbose)

	// Prepare site configurations
	var siteConfigs []scraper.SiteConfig
	if len(cfg.Scrape.Sites) > 0 {
		// Use configurations from rollup.yml
		logger.Printf("Using configuration from rollup.yml for %d sites", len(cfg.Scrape.Sites))
		siteConfigs = make([]scraper.SiteConfig, len(cfg.Scrape.Sites))
		for i, site := range cfg.Scrape.Sites {
			siteConfigs[i] = scraper.SiteConfig{
				BaseURL:          site.BaseURL,
				CSSLocator:       site.CSSLocator,
				ExcludeSelectors: site.ExcludeSelectors,
				MaxDepth:         site.MaxDepth,
				AllowedPaths:     site.AllowedPaths,
				ExcludePaths:     site.ExcludePaths,
				OutputAlias:      site.OutputAlias,
				PathOverrides:    convertPathOverrides(site.PathOverrides),
			}
			logger.Printf("Site %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d, AllowedPaths=%v",
				i+1, site.BaseURL, site.CSSLocator, site.MaxDepth, site.AllowedPaths)
		}
	} else {
		// Use command-line URLs
		if len(urls) == 0 {
			logger.Println("Error: No URLs provided via --urls flag")
			return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.sites' in the rollup.yml file")
		}
		siteConfigs = make([]scraper.SiteConfig, len(urls))
		for i, u := range urls {
			siteConfigs[i] = scraper.SiteConfig{
				BaseURL:          u,
				CSSLocator:       includeSelector,
				ExcludeSelectors: excludeSelectors,
				MaxDepth:         depth,
				AllowedPaths:     []string{"/"}, // Allow all paths by default
			}
			logger.Printf("URL %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d",
				i+1, u, includeSelector, depth)
		}
	}

	// Set up scraper configuration
	scraperConfig := scraper.Config{
		Sites:      siteConfigs,
		OutputType: outputType,
		Verbose:    verbose,
		Scrape: scraper.ScrapeConfig{
			RequestsPerSecond: cfg.Scrape.RequestsPerSecond,
			BurstLimit:        cfg.Scrape.BurstLimit,
		},
	}
	logger.Printf("Scraper configuration: OutputType=%s, RequestsPerSecond=%f, BurstLimit=%d",
		outputType, scraperConfig.Scrape.RequestsPerSecond, scraperConfig.Scrape.BurstLimit)

	// Validate scrape configuration
	err := validateScrapeConfig(cfg.Scrape)
	if err != nil {
		logger.Printf("Invalid scrape configuration: %v", err)
		return err
	}

	// Start scraping using scraper.ScrapeSites
	logger.Println("Starting scraping process")
	scrapedContent, err := scraper.ScrapeSites(scraperConfig)
	if err != nil {
		logger.Printf("Error occurred during scraping: %v", err)
		return fmt.Errorf("error scraping content: %v", err)
	}
	logger.Printf("Scraping completed. Total content scraped: %d", len(scrapedContent))

	// Write output to files
	if outputType == "single" {
		logger.Println("Writing content to a single file")
		return writeSingleFile(scrapedContent)
	}
	logger.Println("Writing content to multiple files")
	return writeMultipleFiles(scrapedContent)
}

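// writeSingleFile concatenates every scraped page into one timestamped
// rollup file, separating pages with a heading and a horizontal rule.
// Iteration over a Go map is unordered, so page order can vary between runs.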
func writeSingleFile(content map[string]string) error {
	outputFile := generateDefaultFilename()
	file, err := os.Create(outputFile)
	if err != nil {
		return fmt.Errorf("error creating output file: %v", err)
	}
	defer file.Close()

	for url, c := range content {
		_, err = fmt.Fprintf(file, "# ::: Content from %s\n\n%s\n\n---\n\n", url, c)
		if err != nil {
			return fmt.Errorf("error writing content to file: %v", err)
		}
	}

	fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(content), outputFile)
	return nil
}

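// writeMultipleFiles writes each scraped page to its own
// <sanitized-url>.rollup.md file.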
func writeMultipleFiles(content map[string]string) error {
	for url, c := range content {
		filename := sanitizeFilename(url) + ".rollup.md"
		file, err := os.Create(filename)
		if err != nil {
			return fmt.Errorf("error creating output file %s: %v", filename, err)
		}

		_, err = fmt.Fprintf(file, "# ::: Content from %s\n\n%s\n", url, c)
		if err != nil {
			file.Close()
			return fmt.Errorf("error writing content to file %s: %v", filename, err)
		}

		file.Close()
		fmt.Printf("Content from %s has been saved to %s\n", url, filename)
	}

	return nil
}

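// generateDefaultFilename returns a timestamped name such as
// web-20250101-120000.rollup.md.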
func generateDefaultFilename() string {
	timestamp := time.Now().Format("20060102-150405")
	return fmt.Sprintf("web-%s.rollup.md", timestamp)
}

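// sanitizeFilename turns a URL into a filesystem-safe name; for example,
// "https://example.com/docs" becomes "https___example_com_docs".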
func sanitizeFilename(name string) string {
	// Replace any character that isn't alphanumeric, dash, or underscore with an underscore
	name = strings.Map(func(r rune) rune {
		if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '-' || r == '_' {
			return r
		}
		return '_'
	}, name)

	// Trim any leading or trailing underscores
	name = strings.Trim(name, "_")

	// If the name is empty after sanitization, use a default name
	if name == "" {
		name = "untitled"
	}

	return name
}

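// convertPathOverrides maps path-override entries from the config package's
// type onto the scraper package's equivalent.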
func convertPathOverrides(configOverrides []config.PathOverride) []scraper.PathOverride {
	scraperOverrides := make([]scraper.PathOverride, len(configOverrides))
	for i, override := range configOverrides {
		scraperOverrides[i] = scraper.PathOverride{
			Path:             override.Path,
			CSSLocator:       override.CSSLocator,
			ExcludeSelectors: override.ExcludeSelectors,
		}
	}
	return scraperOverrides
}