mirror of
https://github.com/tnypxl/rollup.git
synced 2025-12-15 15:03:17 +00:00
fix: Move HTML to Markdown conversion to scraper.go
This commit is contained in:
@@ -174,13 +174,9 @@ func extractAndConvertContent(urlStr string) (string, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create a new converter
|
markdown, err := scraper.ProcessHTMLContent(content, scraper.Config{})
|
||||||
converter := md.NewConverter("", true, nil)
|
|
||||||
|
|
||||||
// Convert HTML to Markdown
|
|
||||||
markdown, err := converter.ConvertString(content)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("error converting HTML to Markdown: %v", err)
|
return "", fmt.Errorf("error processing HTML content: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
parsedURL, err := url.Parse(urlStr)
|
parsedURL, err := url.Parse(urlStr)
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ import (
|
|||||||
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
"github.com/playwright-community/playwright-go"
|
"github.com/playwright-community/playwright-go"
|
||||||
"github.com/russross/blackfriday/v2"
|
md "github.com/JohannesKaufmann/html-to-markdown"
|
||||||
)
|
)
|
||||||
|
|
||||||
var logger *log.Logger
|
var logger *log.Logger
|
||||||
@@ -231,18 +231,20 @@ func ProcessHTMLContent(htmlContent string, config Config) (string, error) {
|
|||||||
return "", fmt.Errorf("error extracting content: %v", err)
|
return "", fmt.Errorf("error extracting content: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
markdown := convertToMarkdown(content)
|
// Create a new converter
|
||||||
|
converter := md.NewConverter("", true, nil)
|
||||||
|
|
||||||
|
// Convert HTML to Markdown
|
||||||
|
markdown, err := converter.ConvertString(content)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("Error converting HTML to Markdown: %v\n", err)
|
||||||
|
return "", fmt.Errorf("error converting HTML to Markdown: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
log.Printf("Converted HTML to Markdown (length: %d)\n", len(markdown))
|
log.Printf("Converted HTML to Markdown (length: %d)\n", len(markdown))
|
||||||
return markdown, nil
|
return markdown, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func convertToMarkdown(html string) string {
|
|
||||||
// Use a simple HTML-to-Markdown conversion
|
|
||||||
markdown := blackfriday.Run([]byte(html),
|
|
||||||
blackfriday.WithExtensions(blackfriday.CommonExtensions|blackfriday.HardLineBreak))
|
|
||||||
return string(markdown)
|
|
||||||
}
|
|
||||||
|
|
||||||
func scrollPage(page playwright.Page) error {
|
func scrollPage(page playwright.Page) error {
|
||||||
log.Println("Starting page scroll")
|
log.Println("Starting page scroll")
|
||||||
script := `
|
script := `
|
||||||
|
|||||||
Reference in New Issue
Block a user