mirror of
https://github.com/tnypxl/rollup.git
synced 2025-12-15 23:13:22 +00:00
feat: Separate include and exclude selectors in web scraper
This commit is contained in:
20
cmd/web.go
20
cmd/web.go
@@ -12,10 +12,11 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
urls []string
|
urls []string
|
||||||
outputFile string
|
outputFile string
|
||||||
depth int
|
depth int
|
||||||
cssSelector string
|
includeSelector string
|
||||||
|
excludeSelectors []string
|
||||||
)
|
)
|
||||||
|
|
||||||
var scraperConfig scraper.Config
|
var scraperConfig scraper.Config
|
||||||
@@ -32,7 +33,8 @@ func init() {
|
|||||||
webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
|
webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
|
||||||
webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-<timestamp>.md)")
|
webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-<timestamp>.md)")
|
||||||
webCmd.Flags().IntVarP(&depth, "depth", "d", 0, "Depth of link traversal (default: 0, only scrape the given URLs)")
|
webCmd.Flags().IntVarP(&depth, "depth", "d", 0, "Depth of link traversal (default: 0, only scrape the given URLs)")
|
||||||
webCmd.Flags().StringVar(&cssSelector, "css", "", "CSS selector to extract specific content (use '-' to exclude elements, e.g., 'main - .ads - .navigation')")
|
webCmd.Flags().StringVar(&includeSelector, "include", "", "CSS selector to extract specific content")
|
||||||
|
webCmd.Flags().StringSliceVar(&excludeSelectors, "exclude", []string{}, "CSS selectors to exclude from the extracted content (comma-separated)")
|
||||||
}
|
}
|
||||||
|
|
||||||
func runWeb(cmd *cobra.Command, args []string) error {
|
func runWeb(cmd *cobra.Command, args []string) error {
|
||||||
@@ -125,8 +127,12 @@ func extractAndConvertContent(urlStr string) (string, error) {
|
|||||||
return "", fmt.Errorf("error fetching webpage content: %v", err)
|
return "", fmt.Errorf("error fetching webpage content: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// The content is already extracted using the main element,
|
if includeSelector != "" {
|
||||||
// so we don't need to use ExtractContentWithCSS or ExtractContentWithXPath here
|
content, err = scraper.ExtractContentWithCSS(content, includeSelector, excludeSelectors)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("error extracting content with CSS: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Create a new converter
|
// Create a new converter
|
||||||
converter := md.NewConverter("", true, nil)
|
converter := md.NewConverter("", true, nil)
|
||||||
|
|||||||
@@ -243,16 +243,14 @@ func ExtractLinks(urlStr string) ([]string, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ExtractContentWithCSS extracts content from HTML using a CSS selector
|
// ExtractContentWithCSS extracts content from HTML using a CSS selector
|
||||||
func ExtractContentWithCSS(content, selector string) (string, error) {
|
func ExtractContentWithCSS(content, includeSelector string, excludeSelectors []string) (string, error) {
|
||||||
log.Printf("Extracting content with CSS selector: %s\n", selector)
|
log.Printf("Extracting content with CSS selector: %s\n", includeSelector)
|
||||||
|
|
||||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("error parsing HTML: %v", err)
|
return "", fmt.Errorf("error parsing HTML: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
includeSelector, excludeSelectors := parseSelectors(selector)
|
|
||||||
|
|
||||||
selection := doc.Find(includeSelector)
|
selection := doc.Find(includeSelector)
|
||||||
if selection.Length() == 0 {
|
if selection.Length() == 0 {
|
||||||
return "", fmt.Errorf("no content found with CSS selector: %s", includeSelector)
|
return "", fmt.Errorf("no content found with CSS selector: %s", includeSelector)
|
||||||
@@ -270,14 +268,3 @@ func ExtractContentWithCSS(content, selector string) (string, error) {
|
|||||||
log.Printf("Extracted content length: %d\n", len(selectedContent))
|
log.Printf("Extracted content length: %d\n", len(selectedContent))
|
||||||
return selectedContent, nil
|
return selectedContent, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseSelectors splits the CSS selector string into include and exclude parts
|
|
||||||
func parseSelectors(selector string) (string, []string) {
|
|
||||||
parts := strings.Split(selector, "-")
|
|
||||||
includeSelector := strings.TrimSpace(parts[0])
|
|
||||||
var excludeSelectors []string
|
|
||||||
for _, part := range parts[1:] {
|
|
||||||
excludeSelectors = append(excludeSelectors, strings.TrimPrefix(part, " "))
|
|
||||||
}
|
|
||||||
return includeSelector, excludeSelectors
|
|
||||||
}
|
|
||||||
|
|||||||
Reference in New Issue
Block a user