fix: Simplify sanitizeFilename function

2025-12-15 23:13:22 +00:00 · 2024-09-14 20:55:34 -05:00
parent b6de9d211b
commit 939cffb55e
1 changed files with 7 additions and 148 deletions
--- a/cmd/web.go
+++ b/cmd/web.go
@@ -4,7 +4,6 @@ import (
 	"fmt"
 	"net/url"
 	"os"
 	"path/filepath"
 	"regexp"
 	"strings"
 	"time"
@@ -15,10 +14,10 @@ import (
 )
 var (
-	urls            []string
+	urls             []string
-	outputType      string
+	outputType       string
-	depth           int
+	depth            int
-	includeSelector string
+	includeSelector  string
 	excludeSelectors []string
 )
@@ -31,8 +30,6 @@ var webCmd = &cobra.Command{
 	RunE:  runWeb,
 }
 var scraperConfig scraper.Config
 func init() {
 	rootCmd.AddCommand(webCmd)
 	webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
@@ -212,152 +209,14 @@ func sanitizeFilename(name string) string {
 	// Remove any character that isn't alphanumeric, dash, or underscore
 	reg := regexp.MustCompile("[^a-zA-Z0-9-_]+")
 	name = reg.ReplaceAllString(name, "_")
-	
+
 	// Trim any leading or trailing underscores
 	name = strings.Trim(name, "_")
-	
+
 	// If the name is empty after sanitization, use a default name
 	if name == "" {
 		name = "untitled"
 	}
-	
+
 	return name
 }
 // writeSingleFile writes every entry of content (page URL -> page text) into a
 // single file whose name comes from generateDefaultFilename, then prints a
 // summary line. Each entry becomes a "# Content from <url>" section followed
 // by a "---" rule. Go does not specify map iteration order, so the order of
 // sections may differ between runs.
 //
 // It returns an error if the file cannot be created, written, or closed.
 func writeSingleFile(content map[string]string) error {
 	outputFile := generateDefaultFilename(urls)
 	file, err := os.Create(outputFile)
 	if err != nil {
 		return fmt.Errorf("error creating output file: %w", err)
 	}
 	for pageURL, body := range content {
 		if _, err := fmt.Fprintf(file, "# Content from %s\n\n%s\n\n---\n\n", pageURL, body); err != nil {
 			// The write failure is the error worth reporting; Close here only
 			// releases the handle, so its result is intentionally dropped.
 			_ = file.Close()
 			return fmt.Errorf("error writing content to file: %w", err)
 		}
 	}
 	// Close explicitly rather than via defer: a failed Close on a file that was
 	// just written can mean the data never reached disk, so it must surface
 	// before success is reported.
 	if err := file.Close(); err != nil {
 		return fmt.Errorf("error closing output file: %w", err)
 	}
 	fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(content), outputFile)
 	return nil
 }
 // writeMultipleFiles writes each entry of content (page URL -> page text) to
 // its own file, named by getFilenameFromContent, and prints one line per file
 // saved. Every file starts with a "# Content from <url>" heading.
 //
 // os.Create truncates existing files, so entries that resolve to the same
 // filename overwrite one another. Processing stops at the first file that
 // cannot be created, written, or closed, and that error is returned.
 func writeMultipleFiles(content map[string]string) error {
 	for pageURL, body := range content {
 		filename := getFilenameFromContent(body, pageURL)
 		file, err := os.Create(filename)
 		if err != nil {
 			return fmt.Errorf("error creating output file %s: %w", filename, err)
 		}
 		_, writeErr := fmt.Fprintf(file, "# Content from %s\n\n%s", pageURL, body)
 		// Always close, even after a failed write, so the handle is released;
 		// the write error takes precedence when both fail.
 		closeErr := file.Close()
 		if writeErr != nil {
 			return fmt.Errorf("error writing content to file %s: %w", filename, writeErr)
 		}
 		if closeErr != nil {
 			return fmt.Errorf("error closing output file %s: %w", filename, closeErr)
 		}
 		fmt.Printf("Content from %s has been saved to %s\n", pageURL, filename)
 	}
 	return nil
 }
 // generateDefaultFilename builds the default output filename,
 // "rollup-web-<YYYYMMDD-HHMMSS>.md", from the current local time. The urls
 // argument is accepted but not used.
 func generateDefaultFilename(urls []string) string {
 	const stampLayout = "20060102-150405"
 	stamp := time.Now().Format(stampLayout)
 	return fmt.Sprintf("rollup-web-%s.md", stamp)
 }
 // scrapeRecursively starts a crawl at urlStr with a fresh, empty visited set
 // and returns whatever scrapeURL produces for the given depth.
 func scrapeRecursively(urlStr string, depth int) (string, error) {
 	seen := map[string]bool{}
 	return scrapeURL(urlStr, depth, seen)
 }
 // scrapeURL converts the page at urlStr via extractAndConvertContent and,
 // while depth is greater than zero, appends the content of every link that
 // scraper.ExtractLinks reports for it, recursing with depth-1. Sections are
 // joined with a "---" rule. visited is mutated: each URL is marked before it
 // is processed so it is handled at most once per crawl.
 //
 // It returns "" and a nil error when depth is negative or urlStr was already
 // visited. If link extraction fails, the page's own content is returned along
 // with the error. A failure on an individual link is printed as a warning and
 // that link is skipped.
 func scrapeURL(urlStr string, depth int, visited map[string]bool) (string, error) {
 	if depth < 0 || visited[urlStr] {
 		return "", nil
 	}
 	visited[urlStr] = true
 	content, err := extractAndConvertContent(urlStr)
 	if err != nil {
 		return "", err
 	}
 	if depth == 0 {
 		return content, nil
 	}
 	links, err := scraper.ExtractLinks(urlStr)
 	if err != nil {
 		return content, fmt.Errorf("error extracting links: %w", err)
 	}
 	for _, link := range links {
 		subContent, err := scrapeURL(link, depth-1, visited)
 		if err != nil {
 			fmt.Printf("Warning: Error scraping %s: %v\n", link, err)
 			continue
 		}
 		if subContent == "" {
 			// An already-visited link yields no content; skip it so the output
 			// does not accumulate empty "---" sections.
 			continue
 		}
 		content += "\n\n---\n\n" + subContent
 	}
 	return content, nil
 }
 // extractAndConvertContent fetches the page at urlStr, narrows it with the
 // configured CSS selectors when includeSelector is non-empty, converts the
 // resulting HTML to Markdown, and returns it prefixed with a
 // "# Content from <url>" heading and followed by a blank line. Each failing
 // step is reported with a message naming that step.
 func extractAndConvertContent(urlStr string) (string, error) {
 	html, err := scraper.FetchWebpageContent(urlStr)
 	if err != nil {
 		return "", fmt.Errorf("error fetching webpage content: %v", err)
 	}
 	// Selector filtering is opt-in: without an include selector the whole page is converted.
 	if includeSelector != "" {
 		if html, err = scraper.ExtractContentWithCSS(html, includeSelector, excludeSelectors); err != nil {
 			return "", fmt.Errorf("error extracting content with CSS: %v", err)
 		}
 	}
 	// HTML -> Markdown using a freshly constructed converter.
 	markdown, err := md.NewConverter("", true, nil).ConvertString(html)
 	if err != nil {
 		return "", fmt.Errorf("error converting HTML to Markdown: %v", err)
 	}
 	// The heading uses the parsed URL's string form rather than the raw input.
 	parsedURL, err := url.Parse(urlStr)
 	if err != nil {
 		return "", fmt.Errorf("error parsing URL: %v", err)
 	}
 	heading := fmt.Sprintf("# Content from %s\n\n", parsedURL.String())
 	return heading + markdown + "\n\n", nil
 }
 // getFilenameFromContent derives a ".md" filename for a scraped page. When
 // content contains "<title>" followed later by "</title>", the text between
 // them is used; otherwise url is used. The chosen text is passed through
 // sanitizeFilename before ".md" is appended. Tag matching is exact
 // (lowercase, no attributes).
 func getFilenameFromContent(content, url string) string {
 	const openTag, closeTag = "<title>", "</title>"
 	if start := strings.Index(content, openTag); start != -1 {
 		rest := content[start+len(openTag):]
 		// Search for the closing tag only after the opening one, so a stray
 		// "</title>" earlier in the document cannot hide a valid title.
 		if end := strings.Index(rest, closeTag); end != -1 {
 			return sanitizeFilename(rest[:end]) + ".md"
 		}
 	}
 	// No usable title: fall back to the URL.
 	return sanitizeFilename(url) + ".md"
 }
 // filenameUnsafeChars matches each run of characters other than ASCII letters,
 // digits, dashes, and underscores. It is compiled once at package scope
 // instead of on every sanitizeFilename call.
 var filenameUnsafeChars = regexp.MustCompile("[^a-zA-Z0-9-_]+")
 // sanitizeFilename reduces name to ASCII letters, digits, dashes, and
 // underscores: every run of other characters becomes a single "_", then
 // leading and trailing underscores are trimmed. If nothing remains, it
 // returns "untitled".
 func sanitizeFilename(name string) string {
 	name = filenameUnsafeChars.ReplaceAllString(name, "_")
 	name = strings.Trim(name, "_")
 	if name == "" {
 		return "untitled"
 	}
 	return name
 }