feat: Implement web scraping and Markdown conversion

This commit is contained in:
Arik Jones
2024-09-05 23:06:43 -05:00
committed by Arik Jones (aider)
parent 5824f362b6
commit 5ab1a97f1c

View File

@@ -1,107 +1,154 @@
package cmd package cmd
import ( import (
"context"
"fmt" "fmt"
"io/ioutil" "net/url"
"net/http"
"os" "os"
"strings" "strings"
"time" "time"
"github.com/anthropics/anthropic-sdk-go" "github.com/JohannesKaufmann/html-to-markdown"
"github.com/spf13/cobra" "github.com/spf13/cobra"
"github.com/tnypxl/rollup/internal/config"
"github.com/tnypxl/rollup/internal/scraper"
)
// Package-level state shared by the web command's functions.
var (
// urls is the list of pages to scrape. init binds it to the --urls/-u flag;
// runWeb falls back to cfg.Scrape.URL when the flag left it empty.
urls []string
// outputFile is the destination path. init binds it to the --output/-o flag;
// runWeb replaces an empty value with generateDefaultFilename(urls).
outputFile string
// cfg is assigned by runWeb from config.Load("rollup.yml") and is read again
// by extractAndConvertContent for the CSS locator.
cfg *config.Config
) )
var webCmd = &cobra.Command{ var webCmd = &cobra.Command{
Use: "web <url>", Use: "web",
Short: "Fetch and summarize web content", Short: "Scrape main content from webpages and convert to Markdown",
Long: `This command fetches the content of a web page, summarizes it, and saves it as a markdown file.`, Long: `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`,
Args: cobra.ExactArgs(1), RunE: runWeb,
Run: runWeb,
} }
func init() { func init() {
rootCmd.AddCommand(webCmd) rootCmd.AddCommand(webCmd)
webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-<timestamp>.md)")
} }
func runWeb(cmd *cobra.Command, args []string) { func runWeb(cmd *cobra.Command, args []string) error {
url := args[0] var err error
content, err := fetchWebContent(url) cfg, err = config.Load("rollup.yml")
if err != nil { if err != nil {
fmt.Printf("Error fetching web content: %v\n", err) if os.IsNotExist(err) {
return return fmt.Errorf("rollup.yml file not found. Please create a configuration file or provide command-line arguments")
}
return fmt.Errorf("error loading configuration: %v", err)
} }
summary, err := summarizeContent(content) // Use config if available, otherwise use command-line flags
if err != nil { if len(urls) == 0 && cfg.Scrape.URL != "" {
fmt.Printf("Error summarizing content: %v\n", err) urls = []string{cfg.Scrape.URL}
return
} }
err = saveToMarkdown(url, summary) if len(urls) == 0 {
if err != nil { return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.url' in the rollup.yml file")
fmt.Printf("Error saving markdown: %v\n", err)
return
} }
fmt.Println("Web content summarized and saved successfully.") if outputFile == "" {
outputFile = generateDefaultFilename(urls)
}
file, err := os.Create(outputFile)
if err != nil {
return fmt.Errorf("error creating output file: %v", err)
}
defer file.Close()
for i, u := range urls {
extractedContent, err := extractAndConvertContent(u)
if err != nil {
return fmt.Errorf("error extracting and converting content from %s: %v", u, err)
}
if i > 0 {
_, err = file.WriteString("\n\n---\n\n")
if err != nil {
return fmt.Errorf("error writing separator to file: %v", err)
}
}
_, err = file.WriteString(extractedContent)
if err != nil {
return fmt.Errorf("error writing content to file: %v", err)
}
}
fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(urls), outputFile)
return nil
} }
// generateDefaultFilename builds the default output name for a scrape run,
// in the form "<base>-<YYYYMMDD-HHMMSS>.md".
//
// The base is derived from the hostnames of urls: one host yields its name,
// two yield "a-and-b", three or more yield "a-and-N-others". URLs that do not
// parse or carry no hostname (for example "example.com" without a scheme) are
// ignored; if nothing usable remains the base is "web-content". The base is
// capped at 50 bytes without splitting a multi-byte character.
func generateDefaultFilename(urls []string) string {
	const maxBaseLen = 50

	hosts := make([]string, 0, len(urls))
	for _, raw := range urls {
		parsed, err := url.Parse(raw)
		if err != nil {
			continue
		}
		if h := shortHostname(parsed.Hostname()); h != "" {
			hosts = append(hosts, h)
		}
	}

	var base string
	switch len(hosts) {
	case 0:
		base = "web-content"
	case 1:
		base = hosts[0]
	case 2:
		base = hosts[0] + "-and-" + hosts[1]
	default:
		base = fmt.Sprintf("%s-and-%d-others", hosts[0], len(hosts)-1)
	}

	if len(base) > maxBaseLen {
		// Ranging over a string yields the byte offset of each rune start, so
		// the last offset <= maxBaseLen is a safe place to cut.
		cut := 0
		for i := range base {
			if i > maxBaseLen {
				break
			}
			cut = i
		}
		base = base[:cut]
	}

	timestamp := time.Now().Format("20060102-150405")
	return fmt.Sprintf("%s-%s.md", base, timestamp)
}

// shortHostname strips one trailing common TLD (.com, .org, .net, .edu) from
// host and replaces the remaining dots with dashes. Only a true suffix is
// removed, so "www.company.com" becomes "www-company" rather than losing the
// ".com" inside ".company".
func shortHostname(host string) string {
	for _, tld := range []string{".com", ".org", ".net", ".edu"} {
		if strings.HasSuffix(host, tld) {
			host = strings.TrimSuffix(host, tld)
			break
		}
	}
	return strings.ReplaceAll(host, ".", "-")
}
func summarizeContent(content string) (string, error) { func extractAndConvertContent(urlStr string) (string, error) {
client, err := anthropic.NewClient(os.Getenv("ANTHROPIC_API_KEY")) content, err := scraper.FetchWebpageContent(urlStr)
if err != nil { if err != nil {
return "", fmt.Errorf("error creating Anthropic client: %v", err) return "", fmt.Errorf("error fetching webpage content: %v", err)
} }
ctx := context.Background() // Use the CSS locator from the config
msg, err := client.Messages.Create(ctx, &anthropic.MessageCreateParams{ cssLocator := cfg.Scrape.CSSLocator
Model: anthropic.Claude3Sonnet20240229, if cssLocator != "" {
MaxTokens: anthropic.IntPtr(1000), content, err = scraper.ExtractContentWithCSS(content, cssLocator)
System: anthropic.StringPtr("You are a helpful assistant that summarizes web content in markdown format."), if err != nil {
Messages: []anthropic.MessageParam{ return "", fmt.Errorf("error extracting content with CSS selector: %v", err)
{ }
Role: anthropic.MessageRoleUser, }
Content: []anthropic.Content{
{ // Create a new converter
Type: anthropic.ContentTypeText, converter := md.NewConverter("", true, nil)
Text: fmt.Sprintf("Summarize the following web content in markdown format:\n\n%s", content),
}, // Convert HTML to Markdown
}, markdown, err := converter.ConvertString(content)
},
},
})
if err != nil { if err != nil {
return "", err return "", fmt.Errorf("error converting HTML to Markdown: %v", err)
} }
if len(msg.Content) == 0 || msg.Content[0].Type != anthropic.ContentTypeText { parsedURL, err := url.Parse(urlStr)
return "", fmt.Errorf("unexpected response format") if err != nil {
return "", fmt.Errorf("error parsing URL: %v", err)
} }
header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String())
return msg.Content[0].Text, nil return header + markdown + "\n\n", nil
}
// saveToMarkdown writes content to "<page>-web-rollup-<timestamp>.md" in the
// current working directory and returns any write error.
//
// <page> is url without its http:// or https:// scheme, with characters that
// are unsafe in file names replaced by "-" and trailing dashes removed;
// <timestamp> is the local time formatted as YYYYMMDDHHMMSS.
func saveToMarkdown(url string, content string) error {
	pageName := strings.TrimPrefix(strings.TrimPrefix(url, "http://"), "https://")

	// "/" would be read as a directory separator; the other characters appear
	// in ports and query strings and are rejected by some file systems.
	pageName = strings.NewReplacer(
		"/", "-",
		`\`, "-",
		":", "-",
		"?", "-",
		"*", "-",
		`"`, "-",
		"<", "-",
		">", "-",
		"|", "-",
	).Replace(pageName)
	// A trailing slash in the URL would otherwise produce "--web-rollup".
	pageName = strings.TrimRight(pageName, "-")

	timestamp := time.Now().Format("20060102150405")
	filename := fmt.Sprintf("%s-web-rollup-%s.md", pageName, timestamp)
	return os.WriteFile(filename, []byte(content), 0644)
}