mirror of
https://github.com/tnypxl/rollup.git
synced 2025-12-15 23:13:22 +00:00
feat: Separate include and exclude selectors in web scraper
This commit is contained in:
@@ -243,16 +243,14 @@ func ExtractLinks(urlStr string) ([]string, error) {
|
||||
}
|
||||
|
||||
// ExtractContentWithCSS extracts content from HTML using a CSS selector
|
||||
func ExtractContentWithCSS(content, selector string) (string, error) {
|
||||
log.Printf("Extracting content with CSS selector: %s\n", selector)
|
||||
func ExtractContentWithCSS(content, includeSelector string, excludeSelectors []string) (string, error) {
|
||||
log.Printf("Extracting content with CSS selector: %s\n", includeSelector)
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error parsing HTML: %v", err)
|
||||
}
|
||||
|
||||
includeSelector, excludeSelectors := parseSelectors(selector)
|
||||
|
||||
selection := doc.Find(includeSelector)
|
||||
if selection.Length() == 0 {
|
||||
return "", fmt.Errorf("no content found with CSS selector: %s", includeSelector)
|
||||
@@ -270,14 +268,3 @@ func ExtractContentWithCSS(content, selector string) (string, error) {
|
||||
log.Printf("Extracted content length: %d\n", len(selectedContent))
|
||||
return selectedContent, nil
|
||||
}
|
||||
|
||||
// parseSelectors splits a selector expression of the form
// "include - exclude1 - exclude2" into the include selector and the list of
// exclude selectors.
//
// A hyphen only acts as a separator when it stands alone, i.e. it is preceded
// by whitespace (or the start of the string) and followed by whitespace (or
// the end of the string). Hyphens inside CSS identifiers such as
// ".post-content" or "[data-id]" are therefore left untouched.
//
// The returned include selector and every exclude selector are trimmed of
// surrounding whitespace. Empty exclude entries (for example from a trailing
// separator) are dropped. The exclude slice is nil when there are no excludes.
func parseSelectors(selector string) (string, []string) {
	isSpace := func(c byte) bool {
		return c == ' ' || c == '\t' || c == '\n' || c == '\r'
	}

	// Scan byte-wise: '-' is ASCII, so it can never match inside a multi-byte
	// UTF-8 sequence.
	var parts []string
	start := 0
	for i := 0; i < len(selector); i++ {
		if selector[i] != '-' {
			continue
		}
		standsAlone := (i == 0 || isSpace(selector[i-1])) &&
			(i == len(selector)-1 || isSpace(selector[i+1]))
		if !standsAlone {
			continue
		}
		parts = append(parts, selector[start:i])
		start = i + 1
	}
	parts = append(parts, selector[start:])

	includeSelector := strings.TrimSpace(parts[0])

	var excludeSelectors []string
	for _, part := range parts[1:] {
		if trimmed := strings.TrimSpace(part); trimmed != "" {
			excludeSelectors = append(excludeSelectors, trimmed)
		}
	}
	return includeSelector, excludeSelectors
}
|
||||
|
||||
Reference in New Issue
Block a user