flatten scrape config to 'sites:'

* Flatten scrape config to 'sites:'. Update unit tests and README.
* Remove check for file_extensions configuration.
* Show progress indication after 5 seconds.
* Add documentation to functions.
* fix: Remove MaxDepth and link extraction functionality.
* fix: Remove MaxDepth references from cmd/web.go.
2025-12-15 23:13:22 +00:00 · 2024-10-14 16:09:58 -05:00
parent 333b9a366c
commit 02e39baf38
9 changed files with 317 additions and 283 deletions
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -7,35 +7,64 @@ import (
 	"gopkg.in/yaml.v2"
 )

+// Config represents the configuration for the rollup tool
 type Config struct {
-	FileTypes     []string     `yaml:"file_types"`
-	Ignore        []string     `yaml:"ignore"`
-	CodeGenerated []string     `yaml:"code_generated"`
-	Scrape        ScrapeConfig `yaml:"scrape"`
-}
-
-type ScrapeConfig struct {
-    Sites             []SiteConfig `yaml:"sites"`
-    OutputType        string       `yaml:"output_type"`
-    RequestsPerSecond float64      `yaml:"requests_per_second"`
-    BurstLimit        int          `yaml:"burst_limit"`
+	// FileExtensions is a list of file extensions to include in the rollup
+	FileExtensions []string `yaml:"file_extensions"`
+
+	// IgnorePaths is a list of glob patterns for paths to ignore
+	IgnorePaths []string `yaml:"ignore_paths"`
+
+	// CodeGeneratedPaths is a list of glob patterns for code-generated files
+	CodeGeneratedPaths []string `yaml:"code_generated_paths"`
+
+	// Sites is a list of site configurations for web scraping
+	Sites []SiteConfig `yaml:"sites"`
+
+	// OutputType specifies how the output should be generated
+	OutputType string `yaml:"output_type"`
+
+	// RequestsPerSecond limits the rate of web requests
+	RequestsPerSecond *float64 `yaml:"requests_per_second,omitempty"`
+
+	// BurstLimit sets the maximum burst size for rate limiting
+	BurstLimit *int `yaml:"burst_limit,omitempty"`
 }

+// SiteConfig contains configuration for scraping a single site
 type SiteConfig struct {
-    BaseURL          string            `yaml:"base_url"`
-    CSSLocator       string            `yaml:"css_locator"`
-    ExcludeSelectors []string          `yaml:"exclude_selectors"`
-    MaxDepth         int               `yaml:"max_depth"`
-    AllowedPaths     []string          `yaml:"allowed_paths"`
-    ExcludePaths     []string          `yaml:"exclude_paths"`
-    OutputAlias      string            `yaml:"output_alias"`
-    PathOverrides    []PathOverride    `yaml:"path_overrides"`
+	// BaseURL is the starting point for scraping this site
+	BaseURL string `yaml:"base_url"`
+
+	// CSSLocator is used to extract specific content
+	CSSLocator string `yaml:"css_locator"`
+
+	// ExcludeSelectors lists CSS selectors for content to exclude
+	ExcludeSelectors []string `yaml:"exclude_selectors"`
+
+	// AllowedPaths lists paths that are allowed to be scraped
+	AllowedPaths []string `yaml:"allowed_paths"`
+
+	// ExcludePaths lists paths that should not be scraped
+	ExcludePaths []string `yaml:"exclude_paths"`
+
+	// OutputAlias provides an alternative name for output files
+	OutputAlias string `yaml:"output_alias"`
+
+	// PathOverrides allows for path-specific configurations
+	PathOverrides []PathOverride `yaml:"path_overrides"`
 }

+// PathOverride allows for path-specific configurations
 type PathOverride struct {
-    Path             string   `yaml:"path"`
-    CSSLocator       string   `yaml:"css_locator"`
-    ExcludeSelectors []string `yaml:"exclude_selectors"`
+	// Path is the URL path this override applies to
+	Path string `yaml:"path"`
+
+	// CSSLocator overrides the site-wide CSS locator for this path
+	CSSLocator string `yaml:"css_locator"`
+
+	// ExcludeSelectors overrides the site-wide exclude selectors for this path
+	ExcludeSelectors []string `yaml:"exclude_selectors"`
 }

 func Load(configPath string) (*Config, error) {
@@ -50,15 +79,28 @@ func Load(configPath string) (*Config, error) {
 		return nil, fmt.Errorf("error parsing config file: %v", err)
 	}

+	if err := config.Validate(); err != nil {
+		return nil, fmt.Errorf("invalid configuration: %v", err)
+	}
+
 	return &config, nil
 }

-func DefaultConfigPath() string {
-	return "rollup.yml"
-}
+// Validate checks the configuration for any invalid values
+func (c *Config) Validate() error {
+	if c.RequestsPerSecond != nil && *c.RequestsPerSecond <= 0 {
+		return fmt.Errorf("requests_per_second must be positive")
+	}

-func FileExists(filename string) bool {
-	_, err := os.Stat(filename)
-	return err == nil
-}
+	if c.BurstLimit != nil && *c.BurstLimit <= 0 {
+		return fmt.Errorf("burst_limit must be positive")
+	}

+	for _, site := range c.Sites {
+		if site.BaseURL == "" {
+			return fmt.Errorf("base_url must be specified for each site")
+		}
+	}
+
+	return nil
+}
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -9,34 +9,33 @@ import (
 func TestLoad(t *testing.T) {
 	// Create a temporary config file
 	content := []byte(`
-file_types:
-  - go
-  - md
-ignore:
+file_extensions:
+  - .go
+  - .md
+ignore_paths:
  - "*.tmp"
  - "**/*.log"
-code_generated:
+code_generated_paths:
  - "generated_*.go"
-scrape:
-  sites:
-    - base_url: "https://example.com"
-      css_locator: "main"
-      exclude_selectors:
-        - ".ads"
-      max_depth: 2
-      allowed_paths:
-        - "/blog"
-      exclude_paths:
-        - "/admin"
-      output_alias: "example"
-      path_overrides:
-        - path: "/special"
-          css_locator: ".special-content"
-          exclude_selectors:
-            - ".sidebar"
-  output_type: "single"
-  requests_per_second: 1.0
-  burst_limit: 5
+sites:
+  - base_url: "https://example.com"
+    css_locator: "main"
+    exclude_selectors:
+      - ".ads"
+    max_depth: 2
+    allowed_paths:
+      - "/blog"
+    exclude_paths:
+      - "/admin"
+    output_alias: "example"
+    path_overrides:
+      - path: "/special"
+        css_locator: ".special-content"
+        exclude_selectors:
+          - ".sidebar"
+output_type: "single"
+requests_per_second: 1.0
+burst_limit: 5
 `)

 	tmpfile, err := os.CreateTemp("", "config*.yml")
@@ -59,33 +58,33 @@ scrape:
 	}

 	// Check if the loaded config matches the expected values
+	rps := 1.0
+	bl := 5
 	expectedConfig := &Config{
-		FileTypes:     []string{"go", "md"},
-		Ignore:        []string{"*.tmp", "**/*.log"},
-		CodeGenerated: []string{"generated_*.go"},
-		Scrape: ScrapeConfig{
-			Sites: []SiteConfig{
-				{
-					BaseURL:          "https://example.com",
-					CSSLocator:       "main",
-					ExcludeSelectors: []string{".ads"},
-					MaxDepth:         2,
-					AllowedPaths:     []string{"/blog"},
-					ExcludePaths:     []string{"/admin"},
-					OutputAlias:      "example",
-					PathOverrides: []PathOverride{
-						{
-							Path:             "/special",
-							CSSLocator:       ".special-content",
-							ExcludeSelectors: []string{".sidebar"},
-						},
+		FileExtensions:     []string{".go", ".md"},
+		IgnorePaths:        []string{"*.tmp", "**/*.log"},
+		CodeGeneratedPaths: []string{"generated_*.go"},
+		Sites: []SiteConfig{
+			{
+				BaseURL:          "https://example.com",
+				CSSLocator:       "main",
+				ExcludeSelectors: []string{".ads"},
+				MaxDepth:         2,
+				AllowedPaths:     []string{"/blog"},
+				ExcludePaths:     []string{"/admin"},
+				OutputAlias:      "example",
+				PathOverrides: []PathOverride{
+					{
+						Path:             "/special",
+						CSSLocator:       ".special-content",
+						ExcludeSelectors: []string{".sidebar"},
 					},
 				},
 			},
-			OutputType:        "single",
-			RequestsPerSecond: 1.0,
-			BurstLimit:        5,
 		},
+		OutputType:        "single",
+		RequestsPerSecond: &rps,
+		BurstLimit:        &bl,
 	}

 	if !reflect.DeepEqual(config, expectedConfig) {
@@ -93,28 +92,67 @@ scrape:
 	}
 }

-func TestDefaultConfigPath(t *testing.T) {
-	expected := "rollup.yml"
-	result := DefaultConfigPath()
-	if result != expected {
-		t.Errorf("DefaultConfigPath() = %q, want %q", result, expected)
-	}
-}
-
-func TestFileExists(t *testing.T) {
-	// Test with an existing file
-	tmpfile, err := os.CreateTemp("", "testfile")
-	if err != nil {
-		t.Fatalf("Failed to create temp file: %v", err)
-	}
-	defer os.Remove(tmpfile.Name())
-
-	if !FileExists(tmpfile.Name()) {
-		t.Errorf("FileExists(%q) = false, want true", tmpfile.Name())
-	}
-
-	// Test with a non-existing file
-	if FileExists("non_existing_file.txt") {
-		t.Errorf("FileExists(\"non_existing_file.txt\") = true, want false")
+// TestValidate exercises Config.Validate with valid and invalid configurations.
+func TestValidate(t *testing.T) {
+	tests := []struct {
+		name    string
+		config  Config
+		wantErr bool
+	}{
+		{
+			name: "Valid config",
+			config: Config{
+				FileExtensions: []string{".go"},
+				Sites: []SiteConfig{
+					{BaseURL: "https://example.com"},
+				},
+			},
+			wantErr: false,
+		},
+		{
+			name:    "Empty config", // Validate does not require file_extensions
+			config:  Config{},
+			wantErr: false,
+		},
+		{
+			name: "Invalid requests per second",
+			config: Config{
+				FileExtensions:    []string{".go"},
+				RequestsPerSecond: func() *float64 { f := -1.0; return &f }(),
+			},
+			wantErr: true,
+		},
+		{
+			name: "Invalid burst limit",
+			config: Config{
+				FileExtensions: []string{".go"},
+				BurstLimit:     func() *int { i := -1; return &i }(),
+			},
+			wantErr: true,
+		},
+		{
+			name: "Site without base URL",
+			config: Config{
+				FileExtensions: []string{".go"},
+				Sites:          []SiteConfig{{}},
+			},
+			wantErr: true,
+		},
+		{
+			name: "Zero requests per second",
+			config: Config{
+				FileExtensions:    []string{".go"},
+				RequestsPerSecond: func() *float64 { f := 0.0; return &f }(),
+			},
+			wantErr: true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := tt.config.Validate()
+			if (err != nil) != tt.wantErr {
+				t.Errorf("Validate() error = %v, wantErr %v", err, tt.wantErr)
+			}
+		})
 	}
 }
--- a/internal/scraper/scraper.go
+++ b/internal/scraper/scraper.go
@@ -45,7 +45,6 @@ type SiteConfig struct {
 	BaseURL          string
 	CSSLocator       string
 	ExcludeSelectors []string
-	MaxDepth         int
 	AllowedPaths     []string
 	ExcludePaths     []string
 	OutputAlias      string
@@ -156,57 +155,6 @@ func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
 	}{url, content, nil}
 }

-func scrapeSite(site SiteConfig, results chan<- struct {
-	url     string
-	content string
-	err     error
-}, limiter *rate.Limiter,
-) {
-	visited := make(map[string]bool)
-	queue := []string{site.BaseURL}
-
-	for len(queue) > 0 {
-		url := queue[0]
-		queue = queue[1:]
-
-		if visited[url] {
-			continue
-		}
-		visited[url] = true
-
-		if !isAllowedURL(url, site) {
-			continue
-		}
-
-		// Wait for rate limiter before making the request
-		err := limiter.Wait(context.Background())
-		if err != nil {
-			results <- struct {
-				url     string
-				content string
-				err     error
-			}{url, "", fmt.Errorf("rate limiter error: %v", err)}
-			continue
-		}
-
-		cssLocator, excludeSelectors := getOverrides(url, site)
-		content, err := scrapeURL(url, cssLocator, excludeSelectors)
-		results <- struct {
-			url     string
-			content string
-			err     error
-		}{url, content, err}
-
-		if len(visited) < site.MaxDepth {
-			links, _ := ExtractLinks(url)
-			for _, link := range links {
-				if !visited[link] && isAllowedURL(link, site) {
-					queue = append(queue, link)
-				}
-			}
-		}
-	}
-}

 func isAllowedURL(urlStr string, site SiteConfig) bool {
 	parsedURL, err := url.Parse(urlStr)
@@ -510,40 +458,6 @@ func scrollPage(page playwright.Page) error {
 	return nil
 }

-// ExtractLinks extracts all links from the given URL
-func ExtractLinks(urlStr string) ([]string, error) {
-	logger.Printf("Extracting links from URL: %s\n", urlStr)
-
-	page, err := browser.NewPage()
-	if err != nil {
-		return nil, fmt.Errorf("could not create page: %v", err)
-	}
-	defer page.Close()
-
-	if _, err = page.Goto(urlStr, playwright.PageGotoOptions{
-		WaitUntil: playwright.WaitUntilStateNetworkidle,
-	}); err != nil {
-		return nil, fmt.Errorf("could not go to page: %v", err)
-	}
-
-	links, err := page.Evaluate(`() => {
-		const anchors = document.querySelectorAll('a');
-		return Array.from(anchors).map(a => a.href);
-	}`)
-	if err != nil {
-		return nil, fmt.Errorf("could not extract links: %v", err)
-	}
-
-	var result []string
-	for _, link := range links.([]interface{}) {
-		// Normalize URL by removing trailing slash
-		normalizedLink := strings.TrimRight(link.(string), "/")
-		result = append(result, normalizedLink)
-	}
-
-	logger.Printf("Extracted %d links\n", len(result))
-	return result, nil
-}

 // ExtractContentWithCSS extracts content from HTML using a CSS selector
 func ExtractContentWithCSS(content, includeSelector string, excludeSelectors []string) (string, error) {