diff --git a/cmd/web.go b/cmd/web.go
index 60626b1..2132d7e 100644
--- a/cmd/web.go
+++ b/cmd/web.go
@@ -12,10 +12,11 @@ import (
 )
 
 var (
-	urls        []string
-	outputFile  string
-	depth       int
-	cssSelector string
+	urls             []string
+	outputFile       string
+	depth            int
+	includeSelector  string
+	excludeSelectors []string
 )
 
 var scraperConfig scraper.Config
@@ -32,7 +33,8 @@ func init() {
 	webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
 	webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-.md)")
 	webCmd.Flags().IntVarP(&depth, "depth", "d", 0, "Depth of link traversal (default: 0, only scrape the given URLs)")
-	webCmd.Flags().StringVar(&cssSelector, "css", "", "CSS selector to extract specific content (use '-' to exclude elements, e.g., 'main - .ads - .navigation')")
+	webCmd.Flags().StringVar(&includeSelector, "include", "", "CSS selector to extract specific content")
+	webCmd.Flags().StringSliceVar(&excludeSelectors, "exclude", []string{}, "CSS selectors to exclude from the extracted content (comma-separated)")
 }
 
 func runWeb(cmd *cobra.Command, args []string) error {
@@ -125,8 +127,12 @@ func extractAndConvertContent(urlStr string) (string, error) {
 		return "", fmt.Errorf("error fetching webpage content: %v", err)
 	}
 
-	// The content is already extracted using the main element,
-	// so we don't need to use ExtractContentWithCSS or ExtractContentWithXPath here
+	if includeSelector != "" {
+		content, err = scraper.ExtractContentWithCSS(content, includeSelector, excludeSelectors)
+		if err != nil {
+			return "", fmt.Errorf("error extracting content with CSS: %v", err)
+		}
+	}
 
 	// Create a new converter
 	converter := md.NewConverter("", true, nil)
diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go
index 823eef7..85ef93b 100644
--- a/internal/scraper/scraper.go
+++ b/internal/scraper/scraper.go
@@ -243,16 +243,14 @@ func ExtractLinks(urlStr string) ([]string, error) {
 }
 
 // ExtractContentWithCSS extracts content from HTML using a CSS selector
-func ExtractContentWithCSS(content, selector string) (string, error) {
-	log.Printf("Extracting content with CSS selector: %s\n", selector)
+func ExtractContentWithCSS(content, includeSelector string, excludeSelectors []string) (string, error) {
+	log.Printf("Extracting content with CSS selector: %s\n", includeSelector)
 	doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
 	if err != nil {
 		return "", fmt.Errorf("error parsing HTML: %v", err)
 	}
 
-	includeSelector, excludeSelectors := parseSelectors(selector)
-
 	selection := doc.Find(includeSelector)
 	if selection.Length() == 0 {
 		return "", fmt.Errorf("no content found with CSS selector: %s", includeSelector)
 	}
@@ -270,14 +268,3 @@ func ExtractContentWithCSS(content, selector string) (string, error) {
 	log.Printf("Extracted content length: %d\n", len(selectedContent))
 	return selectedContent, nil
 }
-
-// parseSelectors splits the CSS selector string into include and exclude parts
-func parseSelectors(selector string) (string, []string) {
-	parts := strings.Split(selector, "-")
-	includeSelector := strings.TrimSpace(parts[0])
-	var excludeSelectors []string
-	for _, part := range parts[1:] {
-		excludeSelectors = append(excludeSelectors, strings.TrimPrefix(part, " "))
-	}
-	return includeSelector, excludeSelectors
-}
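For reviewers who want to exercise the new signature outside the repo, here is a minimal standalone sketch of the include/exclude behavior this diff implies: select the include target, `Remove()` every match of each exclude selector, and serialize what remains. The helper is a re-implementation for demonstration, not the repo's actual function, and the fixture HTML and the `main`/`.ads` selectors are illustrative.

```go
package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// extractContentWithCSS mirrors the include/exclude flow of the patched
// scraper.ExtractContentWithCSS: parse the HTML, find the include selector,
// drop excluded subtrees, and return the remaining inner HTML.
func extractContentWithCSS(content, includeSelector string, excludeSelectors []string) (string, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
	if err != nil {
		return "", fmt.Errorf("error parsing HTML: %v", err)
	}

	selection := doc.Find(includeSelector)
	if selection.Length() == 0 {
		return "", fmt.Errorf("no content found with CSS selector: %s", includeSelector)
	}

	// Remove excluded subtrees in place before serializing.
	for _, exclude := range excludeSelectors {
		selection.Find(exclude).Remove()
	}

	// Html returns the inner HTML of the first matched element.
	return selection.Html()
}

func main() {
	page := `<main><p>keep me</p><div class="ads">drop me</div></main>`
	out, err := extractContentWithCSS(page, "main", []string{".ads"})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(out) // prints: <p>keep me</p>
}
```

A note on the design choice: the removed `parseSelectors` split the combined `--css` value on every `-`, so any selector containing a hyphen (e.g. `.top-nav` or `[data-role]`) would have been misparsed as an exclusion boundary. Splitting the feature into `--include` plus a comma-separated `--exclude` list removes that ambiguity: the old `--css 'main - .ads - .navigation'` becomes `--include main --exclude .ads,.navigation` on the `web` subcommand.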