feat: Implement web scraping and Markdown conversion
Committed by: Arik Jones (aider)
Parent: 5824f362b6
Commit: 5ab1a97f1c
cmd/web.go (181 lines changed)
@@ -1,107 +1,154 @@
 package cmd
 
 import (
-	"context"
 	"fmt"
-	"io/ioutil"
-	"net/http"
+	"net/url"
 	"os"
 	"strings"
 	"time"
 
-	"github.com/anthropics/anthropic-sdk-go"
+	"github.com/JohannesKaufmann/html-to-markdown"
 	"github.com/spf13/cobra"
+	"github.com/tnypxl/rollup/internal/config"
+	"github.com/tnypxl/rollup/internal/scraper"
 )
 
+var (
+	urls       []string
+	outputFile string
+	cfg        *config.Config
+)
+
 var webCmd = &cobra.Command{
-	Use:   "web <url>",
-	Short: "Fetch and summarize web content",
-	Long:  `This command fetches the content of a web page, summarizes it, and saves it as a markdown file.`,
-	Args:  cobra.ExactArgs(1),
-	Run:   runWeb,
+	Use:   "web",
+	Short: "Scrape main content from webpages and convert to Markdown",
+	Long:  `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`,
+	RunE:  runWeb,
 }
 
 func init() {
 	rootCmd.AddCommand(webCmd)
+	webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
+	webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-<timestamp>.md)")
 }
 
-func runWeb(cmd *cobra.Command, args []string) {
-	url := args[0]
-	content, err := fetchWebContent(url)
+func runWeb(cmd *cobra.Command, args []string) error {
+	var err error
+	cfg, err = config.Load("rollup.yml")
 	if err != nil {
-		fmt.Printf("Error fetching web content: %v\n", err)
-		return
+		if os.IsNotExist(err) {
+			return fmt.Errorf("rollup.yml file not found. Please create a configuration file or provide command-line arguments")
+		}
+		return fmt.Errorf("error loading configuration: %v", err)
 	}
 
-	summary, err := summarizeContent(content)
-	if err != nil {
-		fmt.Printf("Error summarizing content: %v\n", err)
-		return
+	// Use config if available, otherwise use command-line flags
+	if len(urls) == 0 && cfg.Scrape.URL != "" {
+		urls = []string{cfg.Scrape.URL}
 	}
 
-	err = saveToMarkdown(url, summary)
-	if err != nil {
-		fmt.Printf("Error saving markdown: %v\n", err)
-		return
+	if len(urls) == 0 {
+		return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.url' in the rollup.yml file")
 	}
 
-	fmt.Println("Web content summarized and saved successfully.")
-}
+	if outputFile == "" {
+		outputFile = generateDefaultFilename(urls)
+	}
 
-func fetchWebContent(url string) (string, error) {
-	resp, err := http.Get(url)
+	file, err := os.Create(outputFile)
 	if err != nil {
-		return "", err
+		return fmt.Errorf("error creating output file: %v", err)
 	}
-	defer resp.Body.Close()
+	defer file.Close()
 
-	body, err := ioutil.ReadAll(resp.Body)
-	if err != nil {
-		return "", err
+	for i, u := range urls {
+		extractedContent, err := extractAndConvertContent(u)
+		if err != nil {
+			return fmt.Errorf("error extracting and converting content from %s: %v", u, err)
+		}
+
+		if i > 0 {
+			_, err = file.WriteString("\n\n---\n\n")
+			if err != nil {
+				return fmt.Errorf("error writing separator to file: %v", err)
+			}
+		}
+
+		_, err = file.WriteString(extractedContent)
+		if err != nil {
+			return fmt.Errorf("error writing content to file: %v", err)
+		}
 	}
 
-	return string(body), nil
+	fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(urls), outputFile)
+	return nil
 }
 
-func summarizeContent(content string) (string, error) {
-	client, err := anthropic.NewClient(os.Getenv("ANTHROPIC_API_KEY"))
-	if err != nil {
-		return "", fmt.Errorf("error creating Anthropic client: %v", err)
+func generateDefaultFilename(urls []string) string {
+	var hostnames []string
+	for _, u := range urls {
+		parsedURL, err := url.Parse(u)
+		if err == nil {
+			hostnames = append(hostnames, parsedURL.Hostname())
+		}
 	}
 
-	ctx := context.Background()
-	msg, err := client.Messages.Create(ctx, &anthropic.MessageCreateParams{
-		Model:     anthropic.Claude3Sonnet20240229,
-		MaxTokens: anthropic.IntPtr(1000),
-		System:    anthropic.StringPtr("You are a helpful assistant that summarizes web content in markdown format."),
-		Messages: []anthropic.MessageParam{
-			{
-				Role: anthropic.MessageRoleUser,
-				Content: []anthropic.Content{
-					{
-						Type: anthropic.ContentTypeText,
-						Text: fmt.Sprintf("Summarize the following web content in markdown format:\n\n%s", content),
-					},
-				},
-			},
-		},
-	})
-	if err != nil {
-		return "", err
+	var baseFilename string
+	if len(hostnames) == 1 {
+		baseFilename = hostnames[0]
+	} else if len(hostnames) == 2 {
+		baseFilename = fmt.Sprintf("%s-and-%s", hostnames[0], hostnames[1])
+	} else if len(hostnames) > 2 {
+		baseFilename = fmt.Sprintf("%s-and-%d-others", hostnames[0], len(hostnames)-1)
+	} else {
+		baseFilename = "web-content"
 	}
 
-	if len(msg.Content) == 0 || msg.Content[0].Type != anthropic.ContentTypeText {
-		return "", fmt.Errorf("unexpected response format")
+	baseFilename = strings.NewReplacer(
+		".com", "",
+		".org", "",
+		".net", "",
+		".edu", "",
+		".", "-",
+	).Replace(baseFilename)
+
+	if len(baseFilename) > 50 {
+		baseFilename = baseFilename[:50]
 	}
 
-	return msg.Content[0].Text, nil
-}
+	timestamp := time.Now().Format("20060102-150405")
+	return fmt.Sprintf("%s-%s.md", baseFilename, timestamp)
+}
 
-func saveToMarkdown(url string, content string) error {
-	pageName := strings.TrimPrefix(strings.TrimPrefix(url, "http://"), "https://")
-	pageName = strings.ReplaceAll(pageName, "/", "-")
-	timestamp := time.Now().Format("20060102150405")
-	filename := fmt.Sprintf("%s-web-rollup-%s.md", pageName, timestamp)
-
-	return ioutil.WriteFile(filename, []byte(content), 0644)
+func extractAndConvertContent(urlStr string) (string, error) {
+	content, err := scraper.FetchWebpageContent(urlStr)
+	if err != nil {
+		return "", fmt.Errorf("error fetching webpage content: %v", err)
+	}
+
+	// Use the CSS locator from the config
+	cssLocator := cfg.Scrape.CSSLocator
+	if cssLocator != "" {
+		content, err = scraper.ExtractContentWithCSS(content, cssLocator)
+		if err != nil {
+			return "", fmt.Errorf("error extracting content with CSS selector: %v", err)
+		}
+	}
+
+	// Create a new converter
+	converter := md.NewConverter("", true, nil)
+
+	// Convert HTML to Markdown
+	markdown, err := converter.ConvertString(content)
+	if err != nil {
+		return "", fmt.Errorf("error converting HTML to Markdown: %v", err)
+	}
+
+	parsedURL, err := url.Parse(urlStr)
+	if err != nil {
+		return "", fmt.Errorf("error parsing URL: %v", err)
+	}
+	header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String())
+
+	return header + markdown + "\n\n", nil
 }
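The new code paths lean on two helpers from `internal/scraper` that this diff does not include. Below is a minimal sketch of that package, with the two signatures inferred from the call sites above; the plain `net/http` fetch and the goquery-based selector extraction are assumptions about the implementation, not code from the repository:

```go
// Hypothetical sketch of internal/scraper; only the two functions the web
// command calls are shown, with signatures inferred from their call sites.
package scraper

import (
	"fmt"
	"io"
	"net/http"
	"strings"

	"github.com/PuerkitoBio/goquery" // assumed dependency for CSS selection
)

// FetchWebpageContent downloads a page and returns its raw HTML.
func FetchWebpageContent(urlStr string) (string, error) {
	resp, err := http.Get(urlStr)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("unexpected status %s for %s", resp.Status, urlStr)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}

// ExtractContentWithCSS reduces an HTML document to the outer HTML of the
// first node matching the given CSS selector.
func ExtractContentWithCSS(html, selector string) (string, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return "", err
	}

	sel := doc.Find(selector).First()
	if sel.Length() == 0 {
		return "", fmt.Errorf("no element matches selector %q", selector)
	}
	return goquery.OuterHtml(sel)
}
```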
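`internal/config` is likewise referenced but not shown. The command's behavior pins down part of its shape: `Load` must surface the raw file-not-found error (the command tests it with `os.IsNotExist`), and the `Scrape` section must expose `URL` and `CSSLocator`. The `scrape.url` key is named in the command's own error message; the `css_locator` key name and the use of `gopkg.in/yaml.v3` are assumptions in this sketch:

```go
// Hypothetical sketch of internal/config, inferred from how cmd/web.go uses it.
package config

import (
	"os"

	"gopkg.in/yaml.v3" // assumed YAML parser
)

type Config struct {
	Scrape ScrapeConfig `yaml:"scrape"`
}

type ScrapeConfig struct {
	URL        string `yaml:"url"`         // single URL ('scrape.url' per the command's error text)
	CSSLocator string `yaml:"css_locator"` // optional CSS selector; key name is an assumption
}

// Load reads and parses a YAML config file. The error from os.ReadFile is
// returned unwrapped so callers can check it with os.IsNotExist.
func Load(path string) (*Config, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}

	var cfg Config
	if err := yaml.Unmarshal(data, &cfg); err != nil {
		return nil, err
	}
	return &cfg, nil
}
```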
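Assuming the binary is named after the module, an invocation would look like `rollup web --urls https://example.com,https://go.dev -o content.md`. Left to the default output name, `generateDefaultFilename` strips the common TLDs and joins the hostnames, so those two URLs would yield something like `example-and-go-dev-20240102-150405.md`.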