fix: Move HTML to Markdown conversion to scraper.go

This commit is contained in:
Arik Jones (aider)
2024-09-14 20:55:35 -05:00
parent 939cffb55e
commit c1755836b5
2 changed files with 13 additions and 15 deletions

View File

@@ -174,13 +174,9 @@ func extractAndConvertContent(urlStr string) (string, error) {
} }
} }
// Create a new converter markdown, err := scraper.ProcessHTMLContent(content, scraper.Config{})
converter := md.NewConverter("", true, nil)
// Convert HTML to Markdown
markdown, err := converter.ConvertString(content)
if err != nil { if err != nil {
return "", fmt.Errorf("error converting HTML to Markdown: %v", err) return "", fmt.Errorf("error processing HTML content: %v", err)
} }
parsedURL, err := url.Parse(urlStr) parsedURL, err := url.Parse(urlStr)

View File

@@ -11,7 +11,7 @@ import (
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
"github.com/playwright-community/playwright-go" "github.com/playwright-community/playwright-go"
"github.com/russross/blackfriday/v2" md "github.com/JohannesKaufmann/html-to-markdown"
) )
var logger *log.Logger var logger *log.Logger
@@ -231,18 +231,20 @@ func ProcessHTMLContent(htmlContent string, config Config) (string, error) {
return "", fmt.Errorf("error extracting content: %v", err) return "", fmt.Errorf("error extracting content: %v", err)
} }
markdown := convertToMarkdown(content) // Create a new converter
converter := md.NewConverter("", true, nil)
// Convert HTML to Markdown
markdown, err := converter.ConvertString(content)
if err != nil {
log.Printf("Error converting HTML to Markdown: %v\n", err)
return "", fmt.Errorf("error converting HTML to Markdown: %v", err)
}
log.Printf("Converted HTML to Markdown (length: %d)\n", len(markdown)) log.Printf("Converted HTML to Markdown (length: %d)\n", len(markdown))
return markdown, nil return markdown, nil
} }
// convertToMarkdown passes html through blackfriday.Run, with the
// CommonExtensions and HardLineBreak extension flags enabled, and returns the
// result as a string.
//
// NOTE(review): blackfriday is documented as a Markdown processor (Markdown
// in, rendered output out), so despite its name this function presumably does
// not turn HTML into Markdown. Confirm against the blackfriday documentation
// before relying on it.
func convertToMarkdown(html string) string {
// Hand the input to blackfriday as bytes, combining both extension flags.
markdown := blackfriday.Run([]byte(html),
blackfriday.WithExtensions(blackfriday.CommonExtensions|blackfriday.HardLineBreak))
// Convert the result to a string for the caller.
return string(markdown)
}
func scrollPage(page playwright.Page) error { func scrollPage(page playwright.Page) error {
log.Println("Starting page scroll") log.Println("Starting page scroll")
script := ` script := `