From 939cffb55e8be3888e421332b3271de37f1c0917 Mon Sep 17 00:00:00 2001 From: Arik Jones Date: Sat, 14 Sep 2024 20:55:34 -0500 Subject: [PATCH] fix: Simplify sanitizeFilename function --- cmd/web.go | 155 +++-------------------------------------------------- 1 file changed, 7 insertions(+), 148 deletions(-) diff --git a/cmd/web.go b/cmd/web.go index fd21ab6..1df7642 100644 --- a/cmd/web.go +++ b/cmd/web.go @@ -4,7 +4,6 @@ import ( "fmt" "net/url" "os" - "path/filepath" "regexp" "strings" "time" @@ -15,10 +14,10 @@ import ( ) var ( - urls []string - outputType string - depth int - includeSelector string + urls []string + outputType string + depth int + includeSelector string excludeSelectors []string ) @@ -31,8 +30,6 @@ var webCmd = &cobra.Command{ RunE: runWeb, } -var scraperConfig scraper.Config - func init() { rootCmd.AddCommand(webCmd) webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)") @@ -212,152 +209,14 @@ func sanitizeFilename(name string) string { // Remove any character that isn't alphanumeric, dash, or underscore reg := regexp.MustCompile("[^a-zA-Z0-9-_]+") name = reg.ReplaceAllString(name, "_") - + // Trim any leading or trailing underscores name = strings.Trim(name, "_") - + // If the name is empty after sanitization, use a default name if name == "" { name = "untitled" } - - return name -} -func writeSingleFile(content map[string]string) error { - outputFile := generateDefaultFilename(urls) - file, err := os.Create(outputFile) - if err != nil { - return fmt.Errorf("error creating output file: %v", err) - } - defer file.Close() - - for url, c := range content { - _, err = file.WriteString(fmt.Sprintf("# Content from %s\n\n%s\n\n---\n\n", url, c)) - if err != nil { - return fmt.Errorf("error writing content to file: %v", err) - } - } - - fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(content), outputFile) - return nil -} - -func writeMultipleFiles(content 
map[string]string) error { - for url, c := range content { - filename := getFilenameFromContent(c, url) - file, err := os.Create(filename) - if err != nil { - return fmt.Errorf("error creating output file %s: %v", filename, err) - } - - _, err = file.WriteString(fmt.Sprintf("# Content from %s\n\n%s", url, c)) - file.Close() - if err != nil { - return fmt.Errorf("error writing content to file %s: %v", filename, err) - } - - fmt.Printf("Content from %s has been saved to %s\n", url, filename) - } - return nil -} - -func generateDefaultFilename(urls []string) string { - timestamp := time.Now().Format("20060102-150405") - return fmt.Sprintf("rollup-web-%s.md", timestamp) -} - -func scrapeRecursively(urlStr string, depth int) (string, error) { - visited := make(map[string]bool) - return scrapeURL(urlStr, depth, visited) -} - -func scrapeURL(urlStr string, depth int, visited map[string]bool) (string, error) { - if depth < 0 || visited[urlStr] { - return "", nil - } - - visited[urlStr] = true - - content, err := extractAndConvertContent(urlStr) - if err != nil { - return "", err - } - - if depth > 0 { - links, err := scraper.ExtractLinks(urlStr) - if err != nil { - return content, fmt.Errorf("error extracting links: %v", err) - } - - for _, link := range links { - subContent, err := scrapeURL(link, depth-1, visited) - if err != nil { - fmt.Printf("Warning: Error scraping %s: %v\n", link, err) - continue - } - content += "\n\n---\n\n" + subContent - } - } - - return content, nil -} - -func extractAndConvertContent(urlStr string) (string, error) { - content, err := scraper.FetchWebpageContent(urlStr) - if err != nil { - return "", fmt.Errorf("error fetching webpage content: %v", err) - } - - if includeSelector != "" { - content, err = scraper.ExtractContentWithCSS(content, includeSelector, excludeSelectors) - if err != nil { - return "", fmt.Errorf("error extracting content with CSS: %v", err) - } - } - - // Create a new converter - converter := md.NewConverter("", true, 
nil) - - // Convert HTML to Markdown - markdown, err := converter.ConvertString(content) - if err != nil { - return "", fmt.Errorf("error converting HTML to Markdown: %v", err) - } - - parsedURL, err := url.Parse(urlStr) - if err != nil { - return "", fmt.Errorf("error parsing URL: %v", err) - } - header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String()) - - return header + markdown + "\n\n", nil -} - -func getFilenameFromContent(content, url string) string { - // Try to extract title from content - titleStart := strings.Index(content, "<title>") - titleEnd := strings.Index(content, "</title>") - if titleStart != -1 && titleEnd != -1 && titleEnd > titleStart { - title := content[titleStart+7 : titleEnd] - return sanitizeFilename(title) + ".md" - } - - // If no title found, use the URL - return sanitizeFilename(url) + ".md" -} - -func sanitizeFilename(name string) string { - // Remove any character that isn't alphanumeric, dash, or underscore - reg := regexp.MustCompile("[^a-zA-Z0-9-_]+") - name = reg.ReplaceAllString(name, "_") - - // Trim any leading or trailing underscores - name = strings.Trim(name, "_") - - // If the name is empty after sanitization, use a default name - if name == "" { - name = "untitled" - } - + return name }