Fix logging and other issues that were preventing scraping

Arik Jones
2024-09-21 15:54:33 -05:00
parent 5482621d99
commit 73116e8d82
9 changed files with 975 additions and 97 deletions

147 cmd/files_test.go Normal file

@@ -0,0 +1,147 @@
package cmd
import (
"os"
"path/filepath"
"strings"
"testing"
"github.com/tnypxl/rollup/internal/config"
)
func TestMatchGlob(t *testing.T) {
tests := []struct {
pattern string
path string
expected bool
}{
{"*.go", "file.go", true},
{"*.go", "file.txt", false},
{"**/*.go", "dir/file.go", true},
{"**/*.go", "dir/subdir/file.go", true},
{"dir/*.go", "dir/file.go", true},
{"dir/*.go", "otherdir/file.go", false},
}
for _, test := range tests {
result := matchGlob(test.pattern, test.path)
if result != test.expected {
t.Errorf("matchGlob(%q, %q) = %v; want %v", test.pattern, test.path, result, test.expected)
}
}
}
func TestIsCodeGenerated(t *testing.T) {
patterns := []string{"generated_*.go", "**/auto_*.go"}
tests := []struct {
path string
expected bool
}{
{"generated_file.go", true},
{"normal_file.go", false},
{"subdir/auto_file.go", true},
{"subdir/normal_file.go", false},
}
for _, test := range tests {
result := isCodeGenerated(test.path, patterns)
if result != test.expected {
t.Errorf("isCodeGenerated(%q, %v) = %v; want %v", test.path, patterns, result, test.expected)
}
}
}
func TestIsIgnored(t *testing.T) {
patterns := []string{"*.tmp", "**/*.log"}
tests := []struct {
path string
expected bool
}{
{"file.tmp", true},
{"file.go", false},
{"subdir/file.log", true},
{"subdir/file.txt", false},
}
for _, test := range tests {
result := isIgnored(test.path, patterns)
if result != test.expected {
t.Errorf("isIgnored(%q, %v) = %v; want %v", test.path, patterns, result, test.expected)
}
}
}
func TestRunRollup(t *testing.T) {
// Create a temporary directory for testing
tempDir, err := os.MkdirTemp("", "rollup_test")
if err != nil {
t.Fatalf("Failed to create temp dir: %v", err)
}
defer os.RemoveAll(tempDir)
// Create some test files
files := map[string]string{
"file1.go": "package main\n\nfunc main() {}\n",
"file2.txt": "This is a text file.\n",
"subdir/file3.go": "package subdir\n\nfunc Func() {}\n",
"subdir/file4.json": "{\"key\": \"value\"}\n",
}
for name, content := range files {
path := filepath.Join(tempDir, name)
err := os.MkdirAll(filepath.Dir(path), 0755)
if err != nil {
t.Fatalf("Failed to create directory: %v", err)
}
err = os.WriteFile(path, []byte(content), 0644)
if err != nil {
t.Fatalf("Failed to write file: %v", err)
}
}
// Set up test configuration
cfg = &config.Config{
FileTypes: []string{"go", "txt"},
Ignore: []string{"*.json"},
CodeGenerated: []string{},
}
path = tempDir
// Run the rollup
err = runRollup()
if err != nil {
t.Fatalf("runRollup() failed: %v", err)
}
// Check if the output file was created
outputFiles, err := filepath.Glob(filepath.Join(tempDir, "*.rollup.md"))
if err != nil {
t.Fatalf("Failed to glob output files: %v", err)
}
if len(outputFiles) != 1 {
t.Fatalf("Expected 1 output file, got %d", len(outputFiles))
}
// Read the content of the output file
content, err := os.ReadFile(outputFiles[0])
if err != nil {
t.Fatalf("Failed to read output file: %v", err)
}
// Check if the content includes the expected files
expectedContent := []string{
"# File: file1.go",
"# File: file2.txt",
"# File: subdir/file3.go",
}
for _, expected := range expectedContent {
if !strings.Contains(string(content), expected) {
t.Errorf("Output file does not contain expected content: %s", expected)
}
}
// Check if the ignored file is not included
if strings.Contains(string(content), "file4.json") {
t.Errorf("Output file contains ignored file: file4.json")
}
}
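
The glob helpers exercised above (matchGlob, isCodeGenerated, isIgnored) live outside this diff. Below is a minimal sketch of a matcher consistent with the TestMatchGlob table, assuming "**/" means "any number of leading directories"; matchGlobSketch is an illustrative name, not the repository's actual implementation:

package cmd

import (
    "path/filepath"
    "strings"
)

// matchGlobSketch delegates plain patterns to filepath.Match and expands a
// leading "**/" by retrying the remaining pattern at each directory level.
func matchGlobSketch(pattern, path string) bool {
    if !strings.Contains(pattern, "**") {
        ok, err := filepath.Match(pattern, path)
        return err == nil && ok
    }
    suffix := strings.TrimPrefix(pattern, "**/")
    segments := strings.Split(path, "/")
    for i := range segments {
        ok, err := filepath.Match(suffix, strings.Join(segments[i:], "/"))
        if err == nil && ok {
            return true
        }
    }
    return false
}

Under that assumption, isCodeGenerated and isIgnored reduce to looping a pattern list over this matcher and returning true on the first hit.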

cmd/web.go

@@ -2,6 +2,8 @@ package cmd
import (
"fmt"
"io/ioutil"
"log"
"net/url"
"os"
"regexp"
@@ -9,6 +11,7 @@ import (
"time"
"github.com/spf13/cobra"
"github.com/tnypxl/rollup/internal/config"
"github.com/tnypxl/rollup/internal/scraper"
)
@@ -38,47 +41,93 @@ func init() {
}
func runWeb(cmd *cobra.Command, args []string) error {
-scraperConfig.Verbose = verbose
-scraper.SetupLogger(verbose)
+logger := log.New(os.Stdout, "WEB: ", log.LstdFlags)
+if !verbose {
+logger.SetOutput(ioutil.Discard)
+}
+logger.Printf("Starting web scraping process with verbose mode: %v", verbose)
+scraperConfig.Verbose = verbose
-// Use config if available, otherwise use command-line flags
-var urlConfigs []scraper.URLConfig
-if len(urls) == 0 && len(cfg.Scrape.URLs) > 0 {
-urlConfigs = make([]scraper.URLConfig, len(cfg.Scrape.URLs))
-for i, u := range cfg.Scrape.URLs {
-urlConfigs[i] = scraper.URLConfig{
-URL: u.URL,
-CSSLocator: u.CSSLocator,
-ExcludeSelectors: u.ExcludeSelectors,
-OutputAlias: u.OutputAlias,
-}
-}
-} else {
-urlConfigs = make([]scraper.URLConfig, len(urls))
-for i, u := range urls {
-urlConfigs[i] = scraper.URLConfig{URL: u, CSSLocator: includeSelector}
-}
-}
+var siteConfigs []scraper.SiteConfig
+if len(cfg.Scrape.Sites) > 0 {
+logger.Printf("Using configuration from rollup.yml for %d sites", len(cfg.Scrape.Sites))
+siteConfigs = make([]scraper.SiteConfig, len(cfg.Scrape.Sites))
+for i, site := range cfg.Scrape.Sites {
+siteConfigs[i] = scraper.SiteConfig{
+BaseURL: site.BaseURL,
+CSSLocator: site.CSSLocator,
+ExcludeSelectors: site.ExcludeSelectors,
+MaxDepth: site.MaxDepth,
+AllowedPaths: site.AllowedPaths,
+ExcludePaths: site.ExcludePaths,
+OutputAlias: site.OutputAlias,
+PathOverrides: convertPathOverrides(site.PathOverrides),
+}
+logger.Printf("Site %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d, AllowedPaths=%v",
+i+1, site.BaseURL, site.CSSLocator, site.MaxDepth, site.AllowedPaths)
+}
+} else {
+logger.Printf("No sites defined in rollup.yml, falling back to URL-based configuration")
+siteConfigs = make([]scraper.SiteConfig, len(urls))
+for i, u := range urls {
+siteConfigs[i] = scraper.SiteConfig{
+BaseURL: u,
+CSSLocator: includeSelector,
+ExcludeSelectors: excludeSelectors,
+MaxDepth: depth,
+}
+logger.Printf("URL %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d",
+i+1, u, includeSelector, depth)
+}
+}
-if len(urlConfigs) == 0 {
-return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.urls' in the rollup.yml file")
-}
+if len(siteConfigs) == 0 {
+logger.Println("Error: No sites or URLs provided")
+return fmt.Errorf("no sites or URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.sites' in the rollup.yml file")
+}
-scraperConfig := scraper.Config{
-URLs: urlConfigs,
-OutputType: outputType,
-Verbose: verbose,
-}
+// Set default values for rate limiting
+defaultRequestsPerSecond := 1.0
+defaultBurstLimit := 3
-scrapedContent, err := scraper.ScrapeMultipleURLs(scraperConfig)
-if err != nil {
-return fmt.Errorf("error scraping content: %v", err)
-}
+// Use default values if not set in the configuration
+requestsPerSecond := cfg.Scrape.RequestsPerSecond
+if requestsPerSecond == 0 {
+requestsPerSecond = defaultRequestsPerSecond
+}
+burstLimit := cfg.Scrape.BurstLimit
+if burstLimit == 0 {
+burstLimit = defaultBurstLimit
+}
-if outputType == "single" {
-return writeSingleFile(scrapedContent)
-} else {
-return writeMultipleFiles(scrapedContent)
-}
+scraperConfig := scraper.Config{
+Sites: siteConfigs,
+OutputType: outputType,
+Verbose: verbose,
+Scrape: scraper.ScrapeConfig{
+RequestsPerSecond: requestsPerSecond,
+BurstLimit: burstLimit,
+},
+}
+logger.Printf("Scraper configuration: OutputType=%s, RequestsPerSecond=%f, BurstLimit=%d",
+outputType, requestsPerSecond, burstLimit)
+logger.Println("Starting scraping process")
+scrapedContent, err := scraper.ScrapeSites(scraperConfig)
+if err != nil {
+logger.Printf("Error occurred during scraping: %v", err)
+return fmt.Errorf("error scraping content: %v", err)
+}
+logger.Printf("Scraping completed. Total content scraped: %d", len(scrapedContent))
+if outputType == "single" {
+logger.Println("Writing content to a single file")
+return writeSingleFile(scrapedContent)
+} else {
+logger.Println("Writing content to multiple files")
+return writeMultipleFiles(scrapedContent)
+}
}
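
Two notes on the rewritten runWeb. First, ioutil.Discard still works, but io.Discard has been the non-deprecated spelling since Go 1.16. Second, the rate-limit values are wired into scraper.ScrapeConfig here, but their enforcement is not part of this diff; what follows is a plausible sketch using golang.org/x/time/rate — an assumption about internal/scraper, with hypothetical helper names:

package scraper

import (
    "context"
    "net/http"

    "golang.org/x/time/rate"
)

// newLimiter builds a token bucket from the values runWeb passes in;
// the defaults above are 1 request per second with a burst of 3.
func newLimiter(requestsPerSecond float64, burstLimit int) *rate.Limiter {
    return rate.NewLimiter(rate.Limit(requestsPerSecond), burstLimit)
}

// fetchWithLimit blocks until a token is available, then issues the request.
func fetchWithLimit(ctx context.Context, limiter *rate.Limiter, url string) (*http.Response, error) {
    if err := limiter.Wait(ctx); err != nil {
        return nil, err
    }
    return http.Get(url)
}
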
func writeSingleFile(content map[string]string) error {
@@ -102,20 +151,26 @@ func writeSingleFile(content map[string]string) error {
func writeMultipleFiles(content map[string]string) error {
for url, c := range content {
-filename := getFilenameFromContent(c, url)
+filename, err := getFilenameFromContent(c, url)
+if err != nil {
+return fmt.Errorf("error generating filename for %s: %v", url, err)
+}
file, err := os.Create(filename)
if err != nil {
return fmt.Errorf("error creating output file %s: %v", filename, err)
}
-_, err = fmt.Fprintf(file, "# Content from %s\n\n%s", url, c)
-file.Close()
+_, err = file.WriteString(fmt.Sprintf("# Content from %s\n\n%s\n", url, c))
if err != nil {
+file.Close()
return fmt.Errorf("error writing content to file %s: %v", filename, err)
}
+file.Close()
fmt.Printf("Content from %s has been saved to %s\n", url, filename)
}
return nil
}
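
The reordering above fixes writeMultipleFiles closing the file before the write error was checked. An equivalent shape that avoids calling Close on two separate paths, sketched with a hypothetical writeOne helper rather than what the commit actually does:

package cmd

import (
    "fmt"
    "os"
)

// writeOne defers Close so every return path releases the file handle.
func writeOne(filename, url, content string) error {
    file, err := os.Create(filename)
    if err != nil {
        return fmt.Errorf("error creating output file %s: %v", filename, err)
    }
    defer file.Close()
    if _, err := fmt.Fprintf(file, "# Content from %s\n\n%s\n", url, content); err != nil {
        return fmt.Errorf("error writing content to file %s: %v", filename, err)
    }
    return nil
}
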
@@ -136,13 +191,13 @@ func scrapeURL(urlStr string, depth int, visited map[string]bool) (string, error)
visited[urlStr] = true
-content, err := extractAndConvertContent(urlStr)
+content, err := testExtractAndConvertContent(urlStr)
if err != nil {
return "", err
}
if depth > 0 {
-links, err := scraper.ExtractLinks(urlStr)
+links, err := testExtractLinks(urlStr)
if err != nil {
return content, fmt.Errorf("error extracting links: %v", err)
}
@@ -160,6 +215,9 @@ func scrapeURL(urlStr string, depth int, visited map[string]bool) (string, error)
return content, nil
}
+var testExtractAndConvertContent = extractAndConvertContent
+var testExtractLinks = scraper.ExtractLinks
func extractAndConvertContent(urlStr string) (string, error) {
content, err := scraper.FetchWebpageContent(urlStr)
if err != nil {
@@ -187,17 +245,32 @@ func extractAndConvertContent(urlStr string) (string, error) {
return header + markdown + "\n\n", nil
}
-func getFilenameFromContent(content, url string) string {
+func getFilenameFromContent(content, urlStr string) (string, error) {
// Try to extract title from content
titleStart := strings.Index(content, "<title>")
titleEnd := strings.Index(content, "</title>")
if titleStart != -1 && titleEnd != -1 && titleEnd > titleStart {
-title := content[titleStart+7 : titleEnd]
-return sanitizeFilename(title) + ".md"
+title := strings.TrimSpace(content[titleStart+7 : titleEnd])
+if title != "" {
+return sanitizeFilename(title) + ".rollup.md", nil
+}
}
-// If no title found, use the URL
-return sanitizeFilename(url) + ".md"
+// If no title found or title is empty, use the URL
+parsedURL, err := url.Parse(urlStr)
+if err != nil {
+return "", fmt.Errorf("invalid URL: %v", err)
+}
+if parsedURL.Host == "" {
+return "", fmt.Errorf("invalid URL: missing host")
+}
+filename := parsedURL.Host
+if parsedURL.Path != "" && parsedURL.Path != "/" {
+filename += strings.TrimSuffix(parsedURL.Path, "/")
+}
+return sanitizeFilename(filename) + ".rollup.md", nil
}
func sanitizeFilename(name string) string {
@@ -215,3 +288,15 @@ func sanitizeFilename(name string) string {
return name
}
+func convertPathOverrides(configOverrides []config.PathOverride) []scraper.PathOverride {
+scraperOverrides := make([]scraper.PathOverride, len(configOverrides))
+for i, override := range configOverrides {
+scraperOverrides[i] = scraper.PathOverride{
+Path: override.Path,
+CSSLocator: override.CSSLocator,
+ExcludeSelectors: override.ExcludeSelectors,
+}
+}
+return scraperOverrides
+}

154 cmd/web_test.go Normal file

@@ -0,0 +1,154 @@
package cmd
import (
"testing"
"strings"
"github.com/tnypxl/rollup/internal/config"
)
func TestConvertPathOverrides(t *testing.T) {
configOverrides := []config.PathOverride{
{
Path: "/blog",
CSSLocator: "article",
ExcludeSelectors: []string{".ads", ".comments"},
},
{
Path: "/products",
CSSLocator: ".product-description",
ExcludeSelectors: []string{".related-items"},
},
}
scraperOverrides := convertPathOverrides(configOverrides)
if len(scraperOverrides) != len(configOverrides) {
t.Errorf("Expected %d overrides, got %d", len(configOverrides), len(scraperOverrides))
}
for i, override := range scraperOverrides {
if override.Path != configOverrides[i].Path {
t.Errorf("Expected Path %s, got %s", configOverrides[i].Path, override.Path)
}
if override.CSSLocator != configOverrides[i].CSSLocator {
t.Errorf("Expected CSSLocator %s, got %s", configOverrides[i].CSSLocator, override.CSSLocator)
}
if len(override.ExcludeSelectors) != len(configOverrides[i].ExcludeSelectors) {
t.Errorf("Expected %d ExcludeSelectors, got %d", len(configOverrides[i].ExcludeSelectors), len(override.ExcludeSelectors))
}
for j, selector := range override.ExcludeSelectors {
if selector != configOverrides[i].ExcludeSelectors[j] {
t.Errorf("Expected ExcludeSelector %s, got %s", configOverrides[i].ExcludeSelectors[j], selector)
}
}
}
}
func TestSanitizeFilename(t *testing.T) {
tests := []struct {
input string
expected string
}{
{"Hello, World!", "Hello_World"},
{"file/with/path", "file_with_path"},
{"file.with.dots", "file_with_dots"},
{"___leading_underscores___", "leading_underscores"},
{"", "untitled"},
{"!@#$%^&*()", "untitled"},
}
for _, test := range tests {
result := sanitizeFilename(test.input)
if result != test.expected {
t.Errorf("sanitizeFilename(%q) = %q; want %q", test.input, result, test.expected)
}
}
}
func TestGetFilenameFromContent(t *testing.T) {
tests := []struct {
content string
url string
expected string
expectErr bool
}{
{"<title>Test Page</title>", "http://example.com", "Test_Page.rollup.md", false},
{"No title here", "http://example.com/page", "example_com_page.rollup.md", false},
{"<title> Trim Me </title>", "http://example.com", "Trim_Me.rollup.md", false},
{"<title></title>", "http://example.com", "example_com.rollup.md", false},
{"<title> </title>", "http://example.com", "example_com.rollup.md", false},
{"Invalid URL", "not a valid url", "", true},
{"No host", "http://", "", true},
}
for _, test := range tests {
result, err := getFilenameFromContent(test.content, test.url)
if test.expectErr {
if err == nil {
t.Errorf("getFilenameFromContent(%q, %q) expected an error, but got none", test.content, test.url)
}
} else {
if err != nil {
t.Errorf("getFilenameFromContent(%q, %q) unexpected error: %v", test.content, test.url, err)
}
if result != test.expected {
t.Errorf("getFilenameFromContent(%q, %q) = %q; want %q", test.content, test.url, result, test.expected)
}
}
}
}
// Mock functions for testing
func mockExtractAndConvertContent(urlStr string) (string, error) {
return "Mocked content for " + urlStr, nil
}
func mockExtractLinks(urlStr string) ([]string, error) {
return []string{"http://example.com/link1", "http://example.com/link2"}, nil
}
func TestScrapeURL(t *testing.T) {
// Store the original functions
originalExtractAndConvertContent := testExtractAndConvertContent
originalExtractLinks := testExtractLinks
// Define mock functions
testExtractAndConvertContent = func(urlStr string) (string, error) {
return "Mocked content for " + urlStr, nil
}
testExtractLinks = func(urlStr string) ([]string, error) {
return []string{"http://example.com/link1", "http://example.com/link2"}, nil
}
// Defer the restoration of original functions
defer func() {
testExtractAndConvertContent = originalExtractAndConvertContent
testExtractLinks = originalExtractLinks
}()
tests := []struct {
url string
depth int
expectedCalls int
}{
{"http://example.com", 0, 1},
{"http://example.com", 1, 3},
{"http://example.com", 2, 3}, // Same as depth 1 because our mock only returns 2 links
}
for _, test := range tests {
visited := make(map[string]bool)
content, err := scrapeURL(test.url, test.depth, visited)
if err != nil {
t.Errorf("scrapeURL(%q, %d) returned error: %v", test.url, test.depth, err)
continue
}
if len(visited) != test.expectedCalls {
t.Errorf("scrapeURL(%q, %d) made %d calls, expected %d", test.url, test.depth, len(visited), test.expectedCalls)
}
expectedContent := "Mocked content for " + test.url
if !strings.Contains(content, expectedContent) {
t.Errorf("scrapeURL(%q, %d) content doesn't contain %q", test.url, test.depth, expectedContent)
}
}
}
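
TestScrapeURL's expectedCalls arithmetic (1, then 3, then 3 again) depends on scrapeURL's recursion and its visited map, whose full body is not in this diff. The sketch below is consistent with the hunks shown and with those counts; scrapeURLSketch is a hypothetical name, assumed to sit in package cmd beside the testExtractAndConvertContent and testExtractLinks seams and reusing its fmt import:

// Each URL is extracted once; links recurse with depth-1 until depth hits 0,
// and the visited map prevents re-fetching, so two mocked links cap the
// total at three regardless of depth.
func scrapeURLSketch(urlStr string, depth int, visited map[string]bool) (string, error) {
    if visited[urlStr] {
        return "", nil
    }
    visited[urlStr] = true
    content, err := testExtractAndConvertContent(urlStr)
    if err != nil {
        return "", err
    }
    if depth > 0 {
        links, err := testExtractLinks(urlStr)
        if err != nil {
            return content, fmt.Errorf("error extracting links: %v", err)
        }
        for _, link := range links {
            sub, err := scrapeURLSketch(link, depth-1, visited)
            if err != nil {
                continue // sketch: skip sub-pages that fail
            }
            content += sub
        }
    }
    return content, nil
}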