From 9341a51d09cae45b27a61dd0a24ac94e6e24645e Mon Sep 17 00:00:00 2001 From: Arik Jones Date: Fri, 6 Dec 2024 17:02:31 -0600 Subject: [PATCH] fix multi-file output --- cmd/web.go | 15 +-- internal/config/config.go | 4 +- internal/config/config_test.go | 4 +- internal/scraper/scraper.go | 189 +++++++++++++++++++++++++++------ 4 files changed, 163 insertions(+), 49 deletions(-) diff --git a/cmd/web.go b/cmd/web.go index c73e32c..81b2ebe 100644 --- a/cmd/web.go +++ b/cmd/web.go @@ -33,7 +33,7 @@ var webCmd = &cobra.Command{ func init() { webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)") - webCmd.Flags().StringVarP(&outputType, "output", "o", "single", "Output type: 'single' for one file, 'separate' for multiple files") + webCmd.Flags().StringVarP(&outputType, "output", "o", "", "Output type: 'single' for one file, 'separate' for multiple files") webCmd.Flags().StringVar(&includeSelector, "css", "", "CSS selector to extract specific content") webCmd.Flags().StringSliceVar(&excludeSelectors, "exclude", []string{}, "CSS selectors to exclude from the extracted content (comma-separated)") } @@ -58,7 +58,6 @@ func runWeb(cmd *cobra.Command, args []string) error { ExcludeSelectors: site.ExcludeSelectors, AllowedPaths: site.AllowedPaths, ExcludePaths: site.ExcludePaths, - OutputAlias: site.OutputAlias, PathOverrides: convertPathOverrides(site.PathOverrides), } logger.Printf("Site %d configuration: BaseURL=%s, CSSLocator=%s, AllowedPaths=%v", @@ -132,7 +131,7 @@ func runWeb(cmd *cobra.Command, args []string) error { } }() - scrapedContent, err := scraper.ScrapeSites(scraperConfig) + err := scraper.ScrapeSites(scraperConfig) done <- true fmt.Println() // New line after progress indicator @@ -140,15 +139,9 @@ func runWeb(cmd *cobra.Command, args []string) error { logger.Printf("Error occurred during scraping: %v", err) return fmt.Errorf("error scraping content: %v", err) } - logger.Printf("Scraping completed. 
Total content scraped: %d", len(scrapedContent)) + logger.Println("Scraping completed") - if outputType == "single" { - logger.Println("Writing content to a single file") - return writeSingleFile(scrapedContent) - } else { - logger.Println("Writing content to multiple files") - return writeMultipleFiles(scrapedContent) - } + return nil } func writeSingleFile(content map[string]string) error { diff --git a/internal/config/config.go b/internal/config/config.go index 3b57b5f..e6a3ba0 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -48,8 +48,8 @@ type SiteConfig struct { // ExcludePaths lists paths that should not be scraped ExcludePaths []string `yaml:"exclude_paths"` - // OutputAlias provides an alternative name for output files - OutputAlias string `yaml:"output_alias"` + // FileNamePrefix provides the base name for output files + FileNamePrefix string `yaml:"file_name_prefix"` // PathOverrides allows for path-specific configurations PathOverrides []PathOverride `yaml:"path_overrides"` diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 84f2c75..400fcc3 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -27,7 +27,7 @@ sites: - "/blog" exclude_paths: - "/admin" - output_alias: "example" + file_name_prefix: "example" path_overrides: - path: "/special" css_locator: ".special-content" @@ -71,7 +71,7 @@ burst_limit: 5 ExcludeSelectors: []string{".ads"}, AllowedPaths: []string{"/blog"}, ExcludePaths: []string{"/admin"}, - OutputAlias: "example", + FileNamePrefix: "example", PathOverrides: []PathOverride{ { Path: "/special", diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go index 3994df3..ec02650 100644 --- a/internal/scraper/scraper.go +++ b/internal/scraper/scraper.go @@ -8,6 +8,7 @@ import ( "math/rand" "net/url" "os" + "path/filepath" "regexp" "strings" "sync" @@ -47,7 +48,7 @@ type SiteConfig struct { ExcludeSelectors []string AllowedPaths []string ExcludePaths []string - OutputAlias string + FileNamePrefix string PathOverrides []PathOverride } @@ -58,16 +59,18 @@ type PathOverride struct { ExcludeSelectors []string } -func ScrapeSites(config Config) (map[string]string, error) { +func ScrapeSites(config Config) error { logger.Println("Starting ScrapeSites function - Verbose mode is active") results := make(chan struct { url string content string + site SiteConfig // Add site config to track which site the content came from err error }) limiter := rate.NewLimiter(rate.Limit(config.Scrape.RequestsPerSecond), config.Scrape.BurstLimit) - logger.Printf("Rate limiter configured with %f requests per second and burst limit of %d\n", config.Scrape.RequestsPerSecond, config.Scrape.BurstLimit) + logger.Printf("Rate limiter configured with %f requests per second and burst limit of %d\n", + config.Scrape.RequestsPerSecond, config.Scrape.BurstLimit) var wg sync.WaitGroup totalURLs := 0 @@ -91,71 +94,73 @@ func ScrapeSites(config Config) (map[string]string, error) { logger.Println("All goroutines completed, results channel closed") }() - scrapedContent := make(map[string]string) + // Use a map that includes site configuration + scrapedContent := make(map[string]struct { + content string + site SiteConfig + }) + for result := range results { if result.err != nil { logger.Printf("Error scraping %s: %v\n", result.url, result.err) continue } - logger.Printf("Successfully scraped content from %s (length: %d)\n", result.url, len(result.content)) - scrapedContent[result.url] = result.content + 
logger.Printf("Successfully scraped content from %s (length: %d)\n", + result.url, len(result.content)) + scrapedContent[result.url] = struct { + content string + site SiteConfig + }{ + content: result.content, + site: result.site, + } } logger.Printf("Total URLs processed: %d\n", totalURLs) logger.Printf("Successfully scraped content from %d URLs\n", len(scrapedContent)) - return scrapedContent, nil + return SaveToFiles(scrapedContent, config) } func scrapeSingleURL(url string, site SiteConfig, results chan<- struct { url string content string + site SiteConfig err error -}, limiter *rate.Limiter, -) { +}, limiter *rate.Limiter) { logger.Printf("Starting to scrape URL: %s\n", url) - // Wait for rate limiter before making the request err := limiter.Wait(context.Background()) if err != nil { - logger.Printf("Rate limiter error for %s: %v\n", url, err) results <- struct { url string content string + site SiteConfig err error - }{url, "", fmt.Errorf("rate limiter error: %v", err)} + }{url, "", site, fmt.Errorf("rate limiter error: %v", err)} return } cssLocator, excludeSelectors := getOverrides(url, site) - logger.Printf("Using CSS locator for %s: %s\n", url, cssLocator) - logger.Printf("Exclude selectors for %s: %v\n", url, excludeSelectors) - content, err := scrapeURL(url, cssLocator, excludeSelectors) if err != nil { - logger.Printf("Error scraping %s: %v\n", url, err) results <- struct { url string content string + site SiteConfig err error - }{url, "", err} + }{url, "", site, err} return } - if content == "" { - logger.Printf("Warning: Empty content scraped from %s\n", url) - } else { - logger.Printf("Successfully scraped content from %s (length: %d)\n", url, len(content)) - } - results <- struct { url string content string + site SiteConfig err error - }{url, content, nil} + }{url, content, site, nil} } - func isAllowedURL(urlStr string, site SiteConfig) bool { parsedURL, err := url.Parse(urlStr) if err != nil { @@ -228,9 +233,14 @@ func getFilenameFromContent(content, url string) string { } func sanitizeFilename(name string) string { - // Remove any character that isn't alphanumeric, dash, or underscore - reg, _ := regexp.Compile("[^a-zA-Z0-9-_]+") - return reg.ReplaceAllString(name, "_") + // Replace all non-alphanumeric characters with dashes + reg := regexp.MustCompile("[^a-zA-Z0-9]+") + name = reg.ReplaceAllString(name, "-") + // Remove any leading or trailing dashes + name = strings.Trim(name, "-") + // Collapse multiple consecutive dashes into one + reg = regexp.MustCompile("-+") + return reg.ReplaceAllString(name, "-") } // URLConfig holds configuration for a single URL @@ -238,7 +248,7 @@ type URLConfig struct { URL string CSSLocator string ExcludeSelectors []string - OutputAlias string + FileNamePrefix string } // SetupLogger initializes the logger based on the verbose flag @@ -266,7 +276,7 @@ func InitPlaywright() error { return fmt.Errorf("could not start Playwright: %v", err) } - userAgent := "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + userAgent := "Mozilla/5.0 (Linux; Android 15; Pixel 9 Build/AP3A.241105.008) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.6723.106 Mobile Safari/537.36 OPX/2.5" browser, err = pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{ Args: []string{fmt.Sprintf("--user-agent=%s", userAgent)}, @@ -299,6 +309,119 @@ func CloseBrowser() { ClosePlaywright() } +// SaveToFiles writes the scraped content to files based on output type +func SaveToFiles(content 
map[string]struct { + content string + site SiteConfig +}, config Config) error { + if config.OutputType == "" { + config.OutputType = "separate" // default to separate files if not specified + } + + switch config.OutputType { + case "single": + if err := os.MkdirAll("output", 0755); err != nil { + return fmt.Errorf("failed to create output directory: %v", err) + } + var combined strings.Builder + for url, data := range content { + combined.WriteString(fmt.Sprintf("## %s\n\n", url)) + combined.WriteString(data.content) + combined.WriteString("\n\n") + } + return os.WriteFile(filepath.Join("output", "combined.md"), []byte(combined.String()), 0644) + + case "separate": + if err := os.MkdirAll("output", 0755); err != nil { + return fmt.Errorf("failed to create output directory: %v", err) + } + + // Group content by site and path + contentBySitePath := make(map[string]map[string]string) + for urlStr, data := range content { + parsedURL, err := url.Parse(urlStr) + if err != nil { + logger.Printf("Warning: Could not parse URL %s: %v", urlStr, err) + continue + } + + // Find matching allowed path for this URL + var matchingPath string + for _, path := range data.site.AllowedPaths { + if strings.HasPrefix(parsedURL.Path, path) { + matchingPath = path + break + } + } + if matchingPath == "" { + logger.Printf("Warning: No matching allowed path for URL %s", urlStr) + continue + } + + siteKey := fmt.Sprintf("%s-%s", data.site.BaseURL, data.site.FileNamePrefix) + if contentBySitePath[siteKey] == nil { + contentBySitePath[siteKey] = make(map[string]string) + } + + // Combine all content for the same path + if existing, exists := contentBySitePath[siteKey][matchingPath]; exists { + contentBySitePath[siteKey][matchingPath] = existing + "\n\n" + data.content + } else { + contentBySitePath[siteKey][matchingPath] = data.content + } + } + + // Write files for each site and path + for siteKey, pathContent := range contentBySitePath { + for path, content := range pathContent { + parts := strings.SplitN(siteKey, "-", 2) // Split only on first hyphen + prefix := parts[1] // Get the FileNamePrefix part + if prefix == "" { + prefix = "doc" // default prefix if none specified + } + + normalizedPath := NormalizePathForFilename(path) + if normalizedPath == "" { + normalizedPath = "index" + } + + filename := filepath.Join("output", fmt.Sprintf("%s-%s.md", + prefix, normalizedPath)) + + // Ensure we don't have empty files + if strings.TrimSpace(content) == "" { + logger.Printf("Skipping empty content for path %s", path) + continue + } + + if err := os.WriteFile(filename, []byte(content), 0644); err != nil { + return fmt.Errorf("failed to write file %s: %v", filename, err) + } + logger.Printf("Wrote content to %s", filename) + } + } + return nil + + default: + return fmt.Errorf("unsupported output type: %s", config.OutputType) + } +} + +// NormalizePathForFilename converts a URL path into a valid filename component +func NormalizePathForFilename(urlPath string) string { + // Remove leading/trailing slashes + path := strings.Trim(urlPath, "/") + // Replace all non-alphanumeric characters with dashes + reg := regexp.MustCompile("[^a-zA-Z0-9]+") + path = reg.ReplaceAllString(path, "-") + // Remove any leading or trailing dashes + path = strings.Trim(path, "-") + // Collapse multiple consecutive dashes into one + reg = regexp.MustCompile("-+") + path = reg.ReplaceAllString(path, "-") + return path +} + // FetchWebpageContent retrieves the content of a webpage using Playwright func FetchWebpageContent(urlStr string) (string, 
error) { logger.Printf("Fetching webpage content for URL: %s\n", urlStr) @@ -337,7 +460,7 @@ func FetchWebpageContent(urlStr string) (string, error) { } logger.Println("Waiting for body element") - + bodyElement := page.Locator("body") err = bodyElement.WaitFor(playwright.LocatorWaitForOptions{ State: playwright.WaitForSelectorStateVisible, @@ -443,8 +566,7 @@ func scrollPage(page playwright.Page) error { previousHeight = currentHeight // Wait for a while before scrolling again - - + } logger.Println("Scrolling back to top") @@ -458,7 +580,6 @@ func scrollPage(page playwright.Page) error { return nil } - // ExtractContentWithCSS extracts content from HTML using a CSS selector func ExtractContentWithCSS(content, includeSelector string, excludeSelectors []string) (string, error) { logger.Printf("Extracting content with CSS selector: %s\n", includeSelector)
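
Illustrative output naming (usage sketch, not part of the patch itself): with the site block from config_test.go (base_url "https://example.com", allowed_paths ["/blog"], file_name_prefix "example"), and assuming the --output flag is still carried into Config.OutputType by cmd/web.go, SaveToFiles behaves as follows:

    -o separate (or -o left unset, since an empty value now defaults to "separate"):
        output/example-blog.md      # "<file_name_prefix>-<normalized allowed path>.md"
    -o single:
        output/combined.md          # every scraped page appended under a "## <url>" header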