diff --git a/cmd/web.go b/cmd/web.go
index 688d9d9..fa993d4 100644
--- a/cmd/web.go
+++ b/cmd/web.go
@@ -14,12 +14,28 @@ import (
 )
 
 var (
-	urls []string
-	outputFile string
-	depth int
-	cssSelector string
+	urls          []string
+	outputFile    string
+	depth         int
+	cssSelector   string
 	xpathSelector string
 )
+
+var webCmd = &cobra.Command{
+	Use:   "web",
+	Short: "Scrape main content from webpages and convert to Markdown",
+	Long:  `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`,
+	RunE:  runWeb,
+}
+
+func init() {
+	rootCmd.AddCommand(webCmd)
+	webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
+	webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-.md)")
+	webCmd.Flags().IntVarP(&depth, "depth", "d", 0, "Depth of link traversal (default: 0, only scrape the given URLs)")
+	webCmd.Flags().StringVar(&cssSelector, "css", "", "CSS selector to extract specific content")
+	webCmd.Flags().StringVar(&xpathSelector, "xpath", "", "XPath selector to extract specific content")
+}
 //
 // var webCmd = &cobra.Command{
 // 	Use:   "web",
@@ -155,33 +171,90 @@ var (
 //
 // 	return header + markdown + "\n\n", nil
 // }
 
-func scrapeRecursively(urlStr string, currentDepth int) (string, error) {
-	if currentDepth < 0 {
-		return "", nil
-	}
-
-	content, err := extractAndConvertContent(urlStr)
+func runWeb(cmd *cobra.Command, args []string) error {
+	var err error
+	cfg, err = config.Load("rollup.yml")
 	if err != nil {
-		return "", err
-	}
-
-	if currentDepth == 0 {
-		return content, nil
-	}
-
-	links, err := scraper.ExtractLinks(urlStr)
-	if err != nil {
-		return content, err
-	}
-
-	for _, link := range links {
-		subContent, err := scrapeRecursively(link, currentDepth-1)
-		if err != nil {
-			fmt.Printf("Warning: Error scraping %s: %v\n", link, err)
-			continue
+		if os.IsNotExist(err) {
+			return fmt.Errorf("rollup.yml file not found. Please create a configuration file or provide command-line arguments")
 		}
-		content += "\n\n---\n\n" + subContent
+		return fmt.Errorf("error loading configuration: %v", err)
 	}
-	return content, nil
+	// Use config if available, otherwise use command-line flags
+	if len(urls) == 0 && cfg.Scrape.URL != "" {
+		urls = []string{cfg.Scrape.URL}
+	}
+
+	if len(urls) == 0 {
+		return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.url' in the rollup.yml file")
+	}
+
+	if outputFile == "" {
+		outputFile = generateDefaultFilename(urls)
+	}
+
+	file, err := os.Create(outputFile)
+	if err != nil {
+		return fmt.Errorf("error creating output file: %v", err)
+	}
+	defer file.Close()
+
+	for i, u := range urls {
+		extractedContent, err := scrapeRecursively(u, depth)
+		if err != nil {
+			return fmt.Errorf("error scraping content from %s: %v", u, err)
+		}
+
+		if i > 0 {
+			_, err = file.WriteString("\n\n---\n\n")
+			if err != nil {
+				return fmt.Errorf("error writing separator to file: %v", err)
+			}
+		}
+
+		_, err = file.WriteString(extractedContent)
+		if err != nil {
+			return fmt.Errorf("error writing content to file: %v", err)
+		}
+	}
+
+	fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(urls), outputFile)
+	return nil
+}
+
+func extractAndConvertContent(urlStr string) (string, error) {
+	content, err := scraper.FetchWebpageContent(urlStr)
+	if err != nil {
+		return "", fmt.Errorf("error fetching webpage content: %v", err)
+	}
+
+	if cssSelector != "" {
+		content, err = scraper.ExtractContentWithCSS(content, cssSelector)
+		if err != nil {
+			return "", fmt.Errorf("error extracting content with CSS selector: %v", err)
+		}
+	} else if xpathSelector != "" {
+		content, err = scraper.ExtractContentWithXPath(content, xpathSelector)
+		if err != nil {
+			return "", fmt.Errorf("error extracting content with XPath selector: %v", err)
+		}
+	}
+
+	// Create a new converter
+	converter := md.NewConverter("", true, nil)
+
+	// Convert HTML to Markdown
+	markdown, err := converter.ConvertString(content)
+	if err != nil {
+		return "", fmt.Errorf("error converting HTML to Markdown: %v", err)
+	}
+
+	parsedURL, err := url.Parse(urlStr)
+	if err != nil {
+		return "", fmt.Errorf("error parsing URL: %v", err)
+	}
+	header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String())
+
+	return header + markdown + "\n\n", nil
 }
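Note on `generateDefaultFilename`: the new `runWeb` calls this helper when `--output` is omitted, but its definition is not part of this diff. Below is a minimal sketch of what such a helper might look like, assuming the default name is derived from the first URL's host; the naming scheme is an assumption for illustration, not the repo's confirmed behavior.

```go
package cmd

import (
	"fmt"
	"net/url"
	"strings"
)

// generateDefaultFilename sketches one plausible scheme for the default
// output name, e.g. https://example.com/docs -> rollup-web-example-com.md.
// Illustrative only; the real helper is defined elsewhere in the repo.
func generateDefaultFilename(urls []string) string {
	if len(urls) == 0 {
		return "rollup-web.md"
	}
	parsed, err := url.Parse(urls[0])
	if err != nil || parsed.Host == "" {
		// Fall back to a generic name when the URL has no usable host.
		return "rollup-web.md"
	}
	// Make the host filesystem-friendly: example.com -> example-com.
	return fmt.Sprintf("rollup-web-%s.md", strings.ReplaceAll(parsed.Host, ".", "-"))
}
```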