From 5ab1a97f1c02793e53094686fcea2bf866ec137d Mon Sep 17 00:00:00 2001
From: Arik Jones
Date: Thu, 5 Sep 2024 23:06:43 -0500
Subject: [PATCH] feat: Implement web scraping and Markdown conversion

---
 cmd/web.go | 181 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 114 insertions(+), 67 deletions(-)

diff --git a/cmd/web.go b/cmd/web.go
index cfe879e..f4c8d95 100644
--- a/cmd/web.go
+++ b/cmd/web.go
@@ -1,107 +1,154 @@
 package cmd
 
 import (
-	"context"
 	"fmt"
-	"io/ioutil"
-	"net/http"
+	"net/url"
 	"os"
 	"strings"
 	"time"
 
-	"github.com/anthropics/anthropic-sdk-go"
+	"github.com/JohannesKaufmann/html-to-markdown"
 	"github.com/spf13/cobra"
+	"github.com/tnypxl/rollup/internal/config"
+	"github.com/tnypxl/rollup/internal/scraper"
+)
+
+var (
+	urls       []string
+	outputFile string
+	cfg        *config.Config
 )
 
 var webCmd = &cobra.Command{
-	Use:   "web ",
-	Short: "Fetch and summarize web content",
-	Long:  `This command fetches the content of a web page, summarizes it, and saves it as a markdown file.`,
-	Args:  cobra.ExactArgs(1),
-	Run:   runWeb,
+	Use:   "web",
+	Short: "Scrape main content from webpages and convert to Markdown",
+	Long:  `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`,
+	RunE:  runWeb,
 }
 
 func init() {
 	rootCmd.AddCommand(webCmd)
+	webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
+	webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-.md)")
 }
 
-func runWeb(cmd *cobra.Command, args []string) {
-	url := args[0]
-	content, err := fetchWebContent(url)
+func runWeb(cmd *cobra.Command, args []string) error {
+	var err error
+	cfg, err = config.Load("rollup.yml")
 	if err != nil {
-		fmt.Printf("Error fetching web content: %v\n", err)
-		return
+		if os.IsNotExist(err) {
+			return fmt.Errorf("rollup.yml file not found. Please create a configuration file or provide command-line arguments")
+		}
+		return fmt.Errorf("error loading configuration: %v", err)
 	}
 
-	summary, err := summarizeContent(content)
-	if err != nil {
-		fmt.Printf("Error summarizing content: %v\n", err)
-		return
+	// Use config if available, otherwise use command-line flags
+	if len(urls) == 0 && cfg.Scrape.URL != "" {
+		urls = []string{cfg.Scrape.URL}
 	}
 
-	err = saveToMarkdown(url, summary)
-	if err != nil {
-		fmt.Printf("Error saving markdown: %v\n", err)
-		return
+	if len(urls) == 0 {
+		return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.url' in the rollup.yml file")
 	}
 
-	fmt.Println("Web content summarized and saved successfully.")
+	if outputFile == "" {
+		outputFile = generateDefaultFilename(urls)
+	}
+
+	file, err := os.Create(outputFile)
+	if err != nil {
+		return fmt.Errorf("error creating output file: %v", err)
+	}
+	defer file.Close()
+
+	for i, u := range urls {
+		extractedContent, err := extractAndConvertContent(u)
+		if err != nil {
+			return fmt.Errorf("error extracting and converting content from %s: %v", u, err)
+		}
+
+		if i > 0 {
+			_, err = file.WriteString("\n\n---\n\n")
+			if err != nil {
+				return fmt.Errorf("error writing separator to file: %v", err)
+			}
+		}
+
+		_, err = file.WriteString(extractedContent)
+		if err != nil {
+			return fmt.Errorf("error writing content to file: %v", err)
+		}
+	}
+
+	fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(urls), outputFile)
+	return nil
 }
 
-func fetchWebContent(url string) (string, error) {
-	resp, err := http.Get(url)
-	if err != nil {
-		return "", err
-	}
-	defer resp.Body.Close()
-
-	body, err := ioutil.ReadAll(resp.Body)
-	if err != nil {
-		return "", err
+func generateDefaultFilename(urls []string) string {
+	var hostnames []string
+	for _, u := range urls {
+		parsedURL, err := url.Parse(u)
+		if err == nil {
+			hostnames = append(hostnames, parsedURL.Hostname())
+		}
 	}
 
-	return string(body), nil
+	var baseFilename string
+	if len(hostnames) == 1 {
+		baseFilename = hostnames[0]
+	} else if len(hostnames) == 2 {
+		baseFilename = fmt.Sprintf("%s-and-%s", hostnames[0], hostnames[1])
+	} else if len(hostnames) > 2 {
+		baseFilename = fmt.Sprintf("%s-and-%d-others", hostnames[0], len(hostnames)-1)
+	} else {
+		baseFilename = "web-content"
+	}
+
+	baseFilename = strings.NewReplacer(
+		".com", "",
+		".org", "",
+		".net", "",
+		".edu", "",
+		".", "-",
+	).Replace(baseFilename)
+
+	if len(baseFilename) > 50 {
+		baseFilename = baseFilename[:50]
+	}
+
+	timestamp := time.Now().Format("20060102-150405")
+	return fmt.Sprintf("%s-%s.md", baseFilename, timestamp)
 }
 
-func summarizeContent(content string) (string, error) {
-	client, err := anthropic.NewClient(os.Getenv("ANTHROPIC_API_KEY"))
+func extractAndConvertContent(urlStr string) (string, error) {
+	content, err := scraper.FetchWebpageContent(urlStr)
 	if err != nil {
-		return "", fmt.Errorf("error creating Anthropic client: %v", err)
+		return "", fmt.Errorf("error fetching webpage content: %v", err)
 	}
 
-	ctx := context.Background()
-	msg, err := client.Messages.Create(ctx, &anthropic.MessageCreateParams{
-		Model:     anthropic.Claude3Sonnet20240229,
-		MaxTokens: anthropic.IntPtr(1000),
-		System:    anthropic.StringPtr("You are a helpful assistant that summarizes web content in markdown format."),
-		Messages: []anthropic.MessageParam{
-			{
-				Role: anthropic.MessageRoleUser,
-				Content: []anthropic.Content{
-					{
-						Type: anthropic.ContentTypeText,
-						Text: fmt.Sprintf("Summarize the following web content in markdown format:\n\n%s", content),
-					},
-				},
-			},
-		},
-	})
+	// Use the CSS locator from the config
+	cssLocator := cfg.Scrape.CSSLocator
+	if cssLocator != "" {
+		content, err = scraper.ExtractContentWithCSS(content, cssLocator)
+		if err != nil {
+			return "", fmt.Errorf("error extracting content with CSS selector: %v", err)
+		}
+	}
+
+	// Create a new converter
+	converter := md.NewConverter("", true, nil)
+
+	// Convert HTML to Markdown
+	markdown, err := converter.ConvertString(content)
 	if err != nil {
-		return "", err
+		return "", fmt.Errorf("error converting HTML to Markdown: %v", err)
 	}
 
-	if len(msg.Content) == 0 || msg.Content[0].Type != anthropic.ContentTypeText {
-		return "", fmt.Errorf("unexpected response format")
+	parsedURL, err := url.Parse(urlStr)
+	if err != nil {
+		return "", fmt.Errorf("error parsing URL: %v", err)
 	}
+	header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String())
 
-	return msg.Content[0].Text, nil
-}
-
-func saveToMarkdown(url string, content string) error {
-	pageName := strings.TrimPrefix(strings.TrimPrefix(url, "http://"), "https://")
-	pageName = strings.ReplaceAll(pageName, "/", "-")
-	timestamp := time.Now().Format("20060102150405")
-	filename := fmt.Sprintf("%s-web-rollup-%s.md", pageName, timestamp)
-
-	return ioutil.WriteFile(filename, []byte(content), 0644)
+	return header + markdown + "\n\n", nil
 }