feat: Add support for excluding child elements in content extraction

This commit is contained in:
Arik Jones (aider)
2024-09-14 16:49:32 -05:00
parent d0ee666b07
commit f1af20e95e

View File

@@ -251,9 +251,15 @@ func ExtractContentWithCSS(content, selector string) (string, error) {
return "", fmt.Errorf("error parsing HTML: %v", err) return "", fmt.Errorf("error parsing HTML: %v", err)
} }
selection := doc.Find(selector) includeSelector, excludeSelectors := parseSelectors(selector)
selection := doc.Find(includeSelector)
if selection.Length() == 0 { if selection.Length() == 0 {
return "", fmt.Errorf("no content found with CSS selector: %s", selector) return "", fmt.Errorf("no content found with CSS selector: %s", includeSelector)
}
for _, excludeSelector := range excludeSelectors {
selection.Find(excludeSelector).Remove()
} }
selectedContent, err := selection.Html() selectedContent, err := selection.Html()
@@ -274,6 +280,8 @@ func ExtractContentWithXPath(content, xpath string) (string, error) {
return "", fmt.Errorf("error parsing HTML: %v", err) return "", fmt.Errorf("error parsing HTML: %v", err)
} }
includeXPath, excludeXPaths := parseSelectors(xpath)
var selectedContent string var selectedContent string
doc.Find("body").Each(func(i int, s *goquery.Selection) { doc.Find("body").Each(func(i int, s *goquery.Selection) {
if content, err := s.Html(); err == nil { if content, err := s.Html(); err == nil {
@@ -282,9 +290,23 @@ func ExtractContentWithXPath(content, xpath string) (string, error) {
}) })
if selectedContent == "" { if selectedContent == "" {
return "", fmt.Errorf("no content found with XPath selector: %s", xpath) return "", fmt.Errorf("no content found with XPath selector: %s", includeXPath)
} }
// Note: XPath exclusion is not implemented here as goquery doesn't support XPath.
// You may need to use a different library for XPath support.
log.Printf("Extracted content length: %d\n", len(selectedContent)) log.Printf("Extracted content length: %d\n", len(selectedContent))
return selectedContent, nil return selectedContent, nil
} }
// parseSelectors splits a combined selector string into the selector to
// include and the selectors to exclude.
//
// The input is split on "!": the text before the first "!" is the include
// selector, and every following "!"-separated segment is an exclude selector.
// Surrounding whitespace is trimmed from each part. Segments that are empty
// after trimming (for example from a trailing "!" or from "!!") are skipped,
// so callers never receive a blank exclude selector. The returned slice is
// nil when there are no exclude selectors.
func parseSelectors(selector string) (string, []string) {
	// strings.Split with a non-empty separator always yields at least one
	// element, so parts[0] is safe even for an empty input.
	parts := strings.Split(selector, "!")
	includeSelector := strings.TrimSpace(parts[0])

	var excludeSelectors []string
	for _, part := range parts[1:] {
		// A blank segment is not a usable selector; drop it instead of
		// passing an empty string on to the caller's query.
		if exclude := strings.TrimSpace(part); exclude != "" {
			excludeSelectors = append(excludeSelectors, exclude)
		}
	}
	return includeSelector, excludeSelectors
}