docs: update configuration section in README.md to include scrape parameters and example usage

fix: set default values for requests_per_second and burst_limit in configuration to prevent rate limiter errors
refactor: remove redundant scraping functions and update runWeb to utilize scraper.ScrapeSites for improved maintainability
2025-12-13 06:23:18 +00:00 · 2024-09-30 14:20:17 -05:00 · 2024-09-30 14:19:00 -05:00 · 2024-09-30 14:10:37 -05:00 · 2024-09-30 14:08:16 -05:00 · 2024-09-30 14:05:10 -05:00
13 changed files with 1956 additions and 42 deletions
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Rollup aggregates the contents of text-based files and webpages into a markdown
 - Flexible configuration file support (YAML)
 - Automatic generation of default configuration file
 - Custom output file naming
- Rate limiting for web scraping to respect server resources
+- Concurrent processing for improved performance

 ## Installation

@@ -63,39 +63,37 @@ rollup [command] [flags]

 Rollup can be configured using a YAML file. By default, it looks for `rollup.yml` in the current directory. You can specify a different configuration file using the `--config` flag.

-Example `rollup.yml`:
+**Scrape Configuration Parameters:**
+
+- `requests_per_second`: *(float, optional)* The maximum number of requests made per second during web scraping. Default is `1.0`.
+- `burst_limit`: *(integer, optional)* The maximum number of requests that can be made in a burst. Default is `5`.
+
+These parameters help control the request rate to avoid overloading the target servers and to comply with their rate limits.
+
+**Example `rollup.yml` with Scrape Configuration:**

 ```yaml
-file_extensions:
-  - go
-  - md
-ignore_paths:
-  - node_modules/**
-  - vendor/**
-  - .git/**
-code_generated_paths:
-  - **/generated/**
-sites:
-  - base_url: https://example.com
-    css_locator: .content
-    exclude_selectors:
-      - .ads
-      - .navigation
-    max_depth: 2
-    allowed_paths:
-      - /blog
-      - /docs
-    exclude_paths:
-      - /admin
-    output_alias: example
-    path_overrides:
-      - path: /special-page
-        css_locator: .special-content
-        exclude_selectors:
-          - .special-ads
-output_type: single
-requests_per_second: 1.0
-burst_limit: 3
+scrape:
+  requests_per_second: 1.0
+  burst_limit: 5
+  sites:
+    - base_url: https://example.com
+      css_locator: .content
+      exclude_selectors:
+        - .ads
+        - .navigation
+      max_depth: 2
+      allowed_paths:
+        - /blog
+        - /docs
+      exclude_paths:
+        - /admin
+      output_alias: example
+      path_overrides:
+        - path: /special-page
+          css_locator: .special-content
+          exclude_selectors:
+            - .special-ads
 ```

 ## Examples
@@ -106,10 +104,10 @@ burst_limit: 3
   rollup files
   ```

-2. Web scraping with multiple URLs:
+2. Web scraping with multiple URLs and increased concurrency:

   ```bash
-   rollup web --urls=https://example.com,https://another-example.com
+   rollup web --urls=https://example.com,https://another-example.com --concurrent=8
   ```

 3. Generate a default configuration file:
@@ -118,22 +116,20 @@ burst_limit: 3
   rollup generate
   ```

-4. Use a custom configuration file:
+4. Use a custom configuration file and specify output:

   ```bash
-   rollup files --config=my-config.yml
+   rollup files --config=my-config.yml --output=project_summary.md
   ```

-5. Web scraping with separate output files:
-
+5. Web scraping with separate output files and custom timeout:
   ```bash
-   rollup web --urls=https://example.com,https://another-example.com --output=separate
+   rollup web --urls=https://example.com,https://another-example.com --output=separate --timeout=60
   ```

 6. Rollup files with specific types and ignore patterns:
-
   ```bash
-   rollup files --types=go,md --ignore=vendor/**,*_test.go
+   rollup files --types=.go,.md --ignore=vendor/**,*_test.go
   ```

 7. Web scraping with depth and CSS selector:
--- a/cmd/files.go
+++ b/cmd/files.go
@@ -0,0 +1,196 @@
+package cmd
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/spf13/cobra"
+	"github.com/tnypxl/rollup/internal/config"
+)
+
+var cfg *config.Config
+
+var (
+	path            string
+	fileTypes       string
+	codeGenPatterns string
+	ignorePatterns  string
+)
+
+var filesCmd = &cobra.Command{ // the "files" subcommand
+	Use:   "files",
+	Short: "Rollup files into a single Markdown file",
+	Long: `The files subcommand writes the contents of all files (with target custom file types provided)
+in a given project, current path or a custom path, to a single timestamped markdown file
+whose name is <project-directory-name>-<timestamp>.rollup.md.`,
+	RunE: func(cmd *cobra.Command, args []string) error { // cfg is the package-level config assigned by Execute
+		return runRollup(cfg)
+	},
+}
+
+func init() {
+	filesCmd.Flags().StringVarP(&path, "path", "p", ".", "Path to the project directory")
+	filesCmd.Flags().StringVarP(&fileTypes, "types", "t", ".go,.md,.txt", "Comma-separated list of file extensions to include")
+	filesCmd.Flags().StringVarP(&codeGenPatterns, "codegen", "g", "", "Comma-separated list of glob patterns for code-generated files")
+	filesCmd.Flags().StringVarP(&ignorePatterns, "ignore", "i", "", "Comma-separated list of glob patterns for files to ignore")
+}
+
+func matchGlob(pattern, path string) bool {
+	parts := strings.Split(pattern, "/")
+	return matchGlobRecursive(parts, path)
+}
+
+// matchGlobRecursive reports whether path matches the "/"-separated pattern segments in
+// patternParts. A "**" segment matches zero or more whole path segments; any other segment is
+// matched with filepath.Match, and a malformed pattern is treated as a non-match.
+func matchGlobRecursive(patternParts []string, path string) bool {
+	if len(patternParts) == 0 {
+		return path == ""
+	}
+	if patternParts[0] == "**" {
+		for i := 0; i <= len(path); i++ { // only segment starts (or the very end) are valid split points
+			if (i == 0 || i == len(path) || path[i-1] == '/') && matchGlobRecursive(patternParts[1:], path[i:]) {
+				return true
+			}
+		}
+		return false
+	}
+	i := strings.IndexByte(path, '/')
+	if i < 0 {
+		matched, _ := filepath.Match(patternParts[0], path)
+		return matched && len(patternParts) == 1
+	}
+	matched, _ := filepath.Match(patternParts[0], path[:i])
+	return matched && matchGlobRecursive(patternParts[1:], path[i+1:])
+}
+
+func isCodeGenerated(filePath string, patterns []string) bool {
+	for _, pattern := range patterns {
+		if strings.Contains(pattern, "**") {
+			if matchGlob(pattern, filePath) {
+				return true
+			}
+		} else {
+			matched, err := filepath.Match(pattern, filepath.Base(filePath))
+			if err == nil && matched {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+func isIgnored(filePath string, patterns []string) bool {
+	for _, pattern := range patterns {
+		if strings.Contains(pattern, "**") {
+			if matchGlob(pattern, filePath) {
+				return true
+			}
+		} else {
+			// Check if the pattern matches the full path or any part of it
+			if matched, _ := filepath.Match(pattern, filePath); matched {
+				return true
+			}
+			pathParts := strings.Split(filePath, string(os.PathSeparator))
+			for i := range pathParts {
+				partialPath := filepath.Join(pathParts[:i+1]...)
+				if matched, _ := filepath.Match(pattern, partialPath); matched {
+					return true
+				}
+			}
+		}
+	}
+	return false
+}
+
+// runRollup concatenates every file under the target directory whose extension is in the
+// type list into one timestamped Markdown file created in the current working directory.
+// Non-empty lists in cfg take precedence over the matching command-line flags. It returns an
+// error if the path cannot be resolved, the output cannot be created or written, or the walk fails.
+func runRollup(cfg *config.Config) error {
+	var types, codeGenList, ignoreList []string
+	if cfg != nil && len(cfg.FileTypes) > 0 {
+		types = cfg.FileTypes
+	} else {
+		types = strings.Split(fileTypes, ",")
+	}
+	if cfg != nil && len(cfg.CodeGenerated) > 0 {
+		codeGenList = cfg.CodeGenerated
+	} else {
+		codeGenList = strings.Split(codeGenPatterns, ",")
+	}
+	if cfg != nil && len(cfg.Ignore) > 0 {
+		ignoreList = cfg.Ignore
+	} else {
+		ignoreList = strings.Split(ignorePatterns, ",")
+	}
+
+	// Get the absolute path
+	absPath, err := filepath.Abs(path)
+	if err != nil {
+		return fmt.Errorf("error getting absolute path: %w", err)
+	}
+
+	// Name the output after the project directory and the current time
+	projectName := filepath.Base(absPath)
+	timestamp := time.Now().Format("20060102-150405")
+	outputFileName := fmt.Sprintf("%s-%s.rollup.md", projectName, timestamp)
+
+	// Open the output file
+	outputFile, err := os.Create(outputFileName)
+	if err != nil {
+		return fmt.Errorf("error creating output file: %w", err)
+	}
+	defer outputFile.Close()
+
+	// Walk through the directory
+	err = filepath.Walk(absPath, func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		if info.IsDir() {
+			if strings.HasPrefix(info.Name(), ".") {
+				return filepath.SkipDir
+			}
+			return nil
+		}
+		relPath, _ := filepath.Rel(absPath, path)
+
+		// Skip ignored files and the rollup output itself, which would otherwise be read while being written
+		if filepath.Base(path) == outputFileName || isIgnored(relPath, ignoreList) {
+			return nil
+		}
+
+		ext := filepath.Ext(path)
+		for _, t := range types {
+			// Accept both "go" (config style) and ".go" (the --types flag default style)
+			if t = strings.TrimPrefix(t, "."); ext == "."+t {
+				content, err := os.ReadFile(path)
+				if err != nil {
+					fmt.Printf("Error reading file %s: %v\n", path, err)
+					return nil
+				}
+				// Check if the file is code-generated
+				codeGenNote := ""
+				if isCodeGenerated(relPath, codeGenList) {
+					codeGenNote = " (Code-generated, Read-only)"
+				}
+
+				if _, err := fmt.Fprintf(outputFile, "# File: %s%s\n\n```%s\n%s```\n\n", relPath, codeGenNote, t, string(content)); err != nil {
+					return fmt.Errorf("error writing %s to output file: %w", relPath, err)
+				}
+				break
+			}
+		}
+		return nil
+	})
+	if err != nil {
+		return fmt.Errorf("error walking through directory: %w", err)
+	}
+
+	fmt.Printf("Rollup complete. Output file: %s\n", outputFileName)
+	return nil
+}
--- a/cmd/files_test.go
+++ b/cmd/files_test.go
@@ -0,0 +1,172 @@
+package cmd
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/tnypxl/rollup/internal/config"
+)
+
+func TestMatchGlob(t *testing.T) {
+	tests := []struct {
+		pattern  string
+		path     string
+		expected bool
+	}{
+		{"*.go", "file.go", true},
+		{"*.go", "file.txt", false},
+		{"**/*.go", "dir/file.go", true},
+		{"**/*.go", "dir/subdir/file.go", true},
+		{"dir/*.go", "dir/file.go", true},
+		{"dir/*.go", "otherdir/file.go", false},
+		{"**/test_*.go", "internal/test_helper.go", true},
+		{"docs/**/*.md", "docs/api/endpoints.md", true},
+		{"docs/**/*.md", "src/docs/readme.md", false},
+	}
+
+	for _, test := range tests {
+		result := matchGlob(test.pattern, test.path)
+		if result != test.expected {
+			t.Errorf("matchGlob(%q, %q) = %v; want %v", test.pattern, test.path, result, test.expected)
+		}
+	}
+}
+
+func TestIsCodeGenerated(t *testing.T) {
+	patterns := []string{"generated_*.go", "**/auto_*.go", "**/*_gen.go"}
+	tests := []struct {
+		path     string
+		expected bool
+	}{
+		{"generated_file.go", true},
+		{"normal_file.go", false},
+		{"subdir/auto_file.go", true},
+		{"subdir/normal_file.go", false},
+		{"pkg/models_gen.go", true},
+		{"pkg/handler.go", false},
+	}
+
+	for _, test := range tests {
+		result := isCodeGenerated(test.path, patterns)
+		if result != test.expected {
+			t.Errorf("isCodeGenerated(%q, %v) = %v; want %v", test.path, patterns, result, test.expected)
+		}
+	}
+}
+
+func TestIsIgnored(t *testing.T) {
+	patterns := []string{"*.tmp", "**/*.log", ".git/**", "vendor/**"}
+	tests := []struct {
+		path     string
+		expected bool
+	}{
+		{"file.tmp", true},
+		{"file.go", false},
+		{"subdir/file.log", true},
+		{"subdir/file.txt", false},
+		{".git/config", true},
+		{"src/.git/config", false},
+		{"vendor/package/file.go", true},
+		{"internal/vendor/file.go", false},
+	}
+
+	for _, test := range tests {
+		result := isIgnored(test.path, patterns)
+		if result != test.expected {
+			t.Errorf("isIgnored(%q, %v) = %v; want %v", test.path, patterns, result, test.expected)
+		}
+	}
+}
+
+func TestRunRollup(t *testing.T) {
+	// Create a temporary directory for testing
+	tempDir, err := os.MkdirTemp("", "rollup_test")
+	if err != nil {
+		t.Fatalf("Failed to create temp dir: %v", err)
+	}
+	defer os.RemoveAll(tempDir)
+
+	// Create some test files
+	files := map[string]string{
+		"file1.go":             "package main\n\nfunc main() {}\n",
+		"file2.txt":            "This is a text file.\n",
+		"subdir/file3.go":      "package subdir\n\nfunc Func() {}\n",
+		"subdir/file4.json":    "{\"key\": \"value\"}\n",
+		"generated_model.go":   "// Code generated DO NOT EDIT.\n\npackage model\n",
+		"docs/api/readme.md":   "# API Documentation\n",
+		".git/config":          "[core]\n\trepositoryformatversion = 0\n",
+		"vendor/lib/helper.go": "package lib\n\nfunc Helper() {}\n",
+	}
+
+	for name, content := range files {
+		path := filepath.Join(tempDir, name)
+		if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+			t.Fatalf("Failed to create directory: %v", err)
+		}
+		if err := os.WriteFile(path, []byte(content), 0o644); err != nil {
+			t.Fatalf("Failed to write file: %v", err)
+		}
+	}
+
+	// Set up test configuration
+	cfg = &config.Config{
+		FileTypes:     []string{"go", "txt", "md"},
+		Ignore:        []string{"*.json", ".git/**", "vendor/**"},
+		CodeGenerated: []string{"generated_*.go"},
+	}
+
+	// Change working directory to the temp directory
+	originalWd, _ := os.Getwd()
+	os.Chdir(tempDir)
+	defer os.Chdir(originalWd)
+
+	// Run the rollup
+	if err := runRollup(cfg); err != nil {
+		t.Fatalf("runRollup() failed: %v", err)
+	}
+
+	// Check if the output file was created
+	outputFiles, err := filepath.Glob("*.rollup.md")
+	if err != nil {
+		t.Fatalf("Error globbing for output file: %v", err)
+	}
+	if len(outputFiles) == 0 {
+		allFiles, _ := filepath.Glob("*")
+		t.Fatalf("No rollup.md file found. Files in directory: %v", allFiles)
+	}
+	outputFile := outputFiles[0]
+
+	// Read the content of the output file
+	content, err := os.ReadFile(outputFile)
+	if err != nil {
+		t.Fatalf("Failed to read output file: %v", err)
+	}
+
+	// Check if the content includes the expected files
+	expectedContent := []string{
+		"# File: file1.go",
+		"# File: file2.txt",
+		"# File: subdir/file3.go",
+		"# File: docs/api/readme.md",
+		"# File: generated_model.go (Code-generated, Read-only)",
+	}
+	for _, expected := range expectedContent {
+		if !strings.Contains(string(content), expected) {
+			t.Errorf("Output file does not contain expected content: %s", expected)
+		}
+	}
+
+	// Check if the ignored files are not included
+	ignoredContent := []string{
+		"file4.json",
+		".git/config",
+		"vendor/lib/helper.go",
+	}
+	for _, ignored := range ignoredContent {
+		if strings.Contains(string(content), ignored) {
+			t.Errorf("Output file contains ignored file: %s", ignored)
+		}
+	}
+}
--- a/cmd/generate.go
+++ b/cmd/generate.go
@@ -0,0 +1,80 @@
+package cmd
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+
+	"github.com/spf13/cobra"
+	"github.com/tnypxl/rollup/internal/config"
+	"gopkg.in/yaml.v2"
+)
+
+var generateCmd = &cobra.Command{
+	Use:   "generate",
+	Short: "Generate a rollup.yml config file",
+	Long:  `Scan the current directory for text and code files and generate a rollup.yml config file based on the found file extensions.`,
+	RunE:  runGenerate,
+}
+
+func runGenerate(cmd *cobra.Command, args []string) error {
+	fileTypes := make(map[string]bool)
+	err := filepath.Walk(".", func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		if !info.IsDir() {
+			ext := strings.TrimPrefix(filepath.Ext(path), ".")
+			if isTextFile(ext) {
+				fileTypes[ext] = true
+			}
+		}
+		return nil
+	})
+	if err != nil {
+		return fmt.Errorf("error walking the path: %v", err)
+	}
+
+	cfg := config.Config{
+		FileTypes: make([]string, 0, len(fileTypes)),
+		Ignore:    []string{"node_modules/**", "vendor/**", ".git/**"},
+	}
+
+	for ext := range fileTypes {
+		cfg.FileTypes = append(cfg.FileTypes, ext)
+	}
+
+	// Sort file types for consistency
+	sort.Strings(cfg.FileTypes)
+
+	yamlData, err := yaml.Marshal(&cfg)
+	if err != nil {
+		return fmt.Errorf("error marshaling config: %v", err)
+	}
+
+	outputPath := config.DefaultConfigPath()
+	err = os.WriteFile(outputPath, yamlData, 0644)
+	if err != nil {
+		return fmt.Errorf("error writing config file: %v", err)
+	}
+
+	fmt.Printf("Generated %s file successfully.\n", outputPath)
+	return nil
+}
+
+func isTextFile(ext string) bool {
+	textExtensions := map[string]bool{
+		"txt": true, "md": true, "go": true, "py": true, "js": true, "html": true, "css": true,
+		"json": true, "xml": true, "yaml": true, "yml": true, "toml": true, "ini": true,
+		"sh": true, "bash": true, "zsh": true, "fish": true,
+		"c": true, "cpp": true, "h": true, "hpp": true, "java": true, "kt": true, "scala": true,
+		"rs": true, "rb": true, "php": true, "ts": true, "swift": true,
+	}
+	return textExtensions[ext]
+}
+
+func init() {
+	// Add any flags for the generate command here if needed
+}
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -0,0 +1,35 @@
+package cmd
+
+import (
+	"github.com/spf13/cobra"
+	config "github.com/tnypxl/rollup/internal/config"
+)
+
+var (
+	configFile string
+	verbose    bool
+)
+
+var rootCmd = &cobra.Command{
+	Use:   "rollup",
+	Short: "Rollup is a tool for combining and processing files",
+	Long: `Rollup is a versatile tool that can combine and process files in various ways.
+Use subcommands to perform specific operations.`,
+}
+
+func Execute(conf *config.Config) error {
+	if conf == nil {
+		conf = &config.Config{} // Use an empty config if none is provided
+	}
+	cfg = conf // Set the cfg variable in cmd/files.go
+	return rootCmd.Execute()
+}
+
+func init() {
+	rootCmd.PersistentFlags().StringVarP(&configFile, "config", "f", "", "Path to the config file (default: rollup.yml in the current directory)")
+	rootCmd.PersistentFlags().BoolVarP(&verbose, "verbose", "v", false, "Enable verbose logging")
+
+	rootCmd.AddCommand(filesCmd)
+	rootCmd.AddCommand(webCmd)
+	rootCmd.AddCommand(generateCmd)
+}
--- a/cmd/web.go
+++ b/cmd/web.go
@@ -0,0 +1,211 @@
+package cmd
+
+import (
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"strings"
+	"time"
+
+	"github.com/spf13/cobra"
+	"github.com/tnypxl/rollup/internal/config"
+	"github.com/tnypxl/rollup/internal/scraper"
+)
+
+var (
+	urls             []string
+	outputType       string
+	depth            int
+	includeSelector  string
+	excludeSelectors []string
+)
+
+var webCmd = &cobra.Command{
+	Use:   "web",
+	Short: "Scrape main content from webpages and convert to Markdown",
+	Long:  `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`,
+	RunE:  runWeb,
+}
+
+func init() {
+	webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
+	webCmd.Flags().StringVarP(&outputType, "output", "o", "single", "Output type: 'single' for one file, 'separate' for multiple files")
+	webCmd.Flags().IntVarP(&depth, "depth", "d", 0, "Depth of link traversal (default: 0, only scrape the given URLs)")
+	webCmd.Flags().StringVar(&includeSelector, "css", "", "CSS selector to extract specific content")
+	webCmd.Flags().StringSliceVar(&excludeSelectors, "exclude", []string{}, "CSS selectors to exclude from the extracted content (comma-separated)")
+}
+
+func validateScrapeConfig(scrapeConfig config.ScrapeConfig) error {
+	if scrapeConfig.RequestsPerSecond <= 0 {
+		return fmt.Errorf("requests_per_second must be greater than 0")
+	}
+	if scrapeConfig.BurstLimit <= 0 {
+		return fmt.Errorf("burst_limit must be greater than 0")
+	}
+	return nil
+}
+
+func runWeb(cmd *cobra.Command, args []string) error {
+    scraper.SetupLogger(verbose)
+    logger := log.New(os.Stdout, "WEB: ", log.LstdFlags)
+    if !verbose {
+        logger.SetOutput(io.Discard)
+    }
+    logger.Printf("Starting web scraping process with verbose mode: %v", verbose)
+
+    // Prepare site configurations
+    var siteConfigs []scraper.SiteConfig
+    if len(cfg.Scrape.Sites) > 0 {
+        // Use configurations from rollup.yml
+        logger.Printf("Using configuration from rollup.yml for %d sites", len(cfg.Scrape.Sites))
+        siteConfigs = make([]scraper.SiteConfig, len(cfg.Scrape.Sites))
+        for i, site := range cfg.Scrape.Sites {
+            siteConfigs[i] = scraper.SiteConfig{
+                BaseURL:          site.BaseURL,
+                CSSLocator:       site.CSSLocator,
+                ExcludeSelectors: site.ExcludeSelectors,
+                MaxDepth:         site.MaxDepth,
+                AllowedPaths:     site.AllowedPaths,
+                ExcludePaths:     site.ExcludePaths,
+                OutputAlias:      site.OutputAlias,
+                PathOverrides:    convertPathOverrides(site.PathOverrides),
+            }
+            logger.Printf("Site %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d, AllowedPaths=%v",
+                i+1, site.BaseURL, site.CSSLocator, site.MaxDepth, site.AllowedPaths)
+        }
+    } else {
+        // Use command-line URLs
+        if len(urls) == 0 {
+            logger.Println("Error: No URLs provided via --urls flag")
+            return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.sites' in the rollup.yml file")
+        }
+        siteConfigs = make([]scraper.SiteConfig, len(urls))
+        for i, u := range urls {
+            siteConfigs[i] = scraper.SiteConfig{
+                BaseURL:          u,
+                CSSLocator:       includeSelector,
+                ExcludeSelectors: excludeSelectors,
+                MaxDepth:         depth,
+                AllowedPaths:     []string{"/"}, // Allow all paths by default
+            }
+            logger.Printf("URL %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d",
+                i+1, u, includeSelector, depth)
+        }
+    }
+
+    // Set up scraper configuration
+    scraperConfig := scraper.Config{
+        Sites:      siteConfigs,
+        OutputType: outputType,
+        Verbose:    verbose,
+        Scrape: scraper.ScrapeConfig{
+            RequestsPerSecond: cfg.Scrape.RequestsPerSecond,
+            BurstLimit:        cfg.Scrape.BurstLimit,
+        },
+    }
+    logger.Printf("Scraper configuration: OutputType=%s, RequestsPerSecond=%f, BurstLimit=%d",
+        outputType, scraperConfig.Scrape.RequestsPerSecond, scraperConfig.Scrape.BurstLimit)
+
+    // Validate scrape configuration
+    err := validateScrapeConfig(cfg.Scrape)
+    if err != nil {
+    	logger.Printf("Invalid scrape configuration: %v", err)
+    	return err
+    }
+
+    // Start scraping using scraper.ScrapeSites
+    logger.Println("Starting scraping process")
+    scrapedContent, err := scraper.ScrapeSites(scraperConfig)
+    if err != nil {
+        logger.Printf("Error occurred during scraping: %v", err)
+        return fmt.Errorf("error scraping content: %v", err)
+    }
+    logger.Printf("Scraping completed. Total content scraped: %d", len(scrapedContent))
+
+    // Write output to files
+    if outputType == "single" {
+        logger.Println("Writing content to a single file")
+        return writeSingleFile(scrapedContent)
+    } else {
+        logger.Println("Writing content to multiple files")
+        return writeMultipleFiles(scrapedContent)
+    }
+}
+
+func writeSingleFile(content map[string]string) error {
+	outputFile := generateDefaultFilename()
+	file, err := os.Create(outputFile)
+	if err != nil {
+		return fmt.Errorf("error creating output file: %v", err)
+	}
+	defer file.Close()
+
+	for url, c := range content {
+		_, err = fmt.Fprintf(file, "# ::: Content from %s\n\n%s\n\n---\n\n", url, c)
+		if err != nil {
+			return fmt.Errorf("error writing content to file: %v", err)
+		}
+	}
+
+	fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(content), outputFile)
+	return nil
+}
+
+func writeMultipleFiles(content map[string]string) error {
+	for url, c := range content {
+		filename := sanitizeFilename(url) + ".rollup.md"
+		file, err := os.Create(filename)
+		if err != nil {
+			return fmt.Errorf("error creating output file %s: %v", filename, err)
+		}
+
+		_, err = file.WriteString(fmt.Sprintf("# ::: Content from %s\n\n%s\n", url, c))
+		if err != nil {
+			file.Close()
+			return fmt.Errorf("error writing content to file %s: %v", filename, err)
+		}
+
+		file.Close()
+		fmt.Printf("Content from %s has been saved to %s\n", url, filename)
+	}
+
+	return nil
+}
+
+func generateDefaultFilename() string {
+	timestamp := time.Now().Format("20060102-150405")
+	return fmt.Sprintf("web-%s.rollup.md", timestamp)
+}
+
+func sanitizeFilename(name string) string {
+	// Remove any character that isn't alphanumeric, dash, or underscore
+	name = strings.Map(func(r rune) rune {
+		if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '-' || r == '_' {
+			return r
+		}
+		return '_'
+	}, name)
+
+	// Trim any leading or trailing underscores
+	name = strings.Trim(name, "_")
+
+	// If the name is empty after sanitization, use a default name
+	if name == "" {
+		name = "untitled"
+	}
+
+	return name
+}
+
+func convertPathOverrides(configOverrides []config.PathOverride) []scraper.PathOverride {
+	scraperOverrides := make([]scraper.PathOverride, len(configOverrides))
+	for i, override := range configOverrides {
+		scraperOverrides[i] = scraper.PathOverride{
+			Path:             override.Path,
+			CSSLocator:       override.CSSLocator,
+			ExcludeSelectors: override.ExcludeSelectors,
+		}
+	}
+	return scraperOverrides
+}
--- a/cmd/web_test.go
+++ b/cmd/web_test.go
@@ -0,0 +1,154 @@
+package cmd
+
+import (
+	"testing"
+	"strings"
+	"github.com/tnypxl/rollup/internal/config"
+)
+
+func TestConvertPathOverrides(t *testing.T) {
+	configOverrides := []config.PathOverride{
+		{
+			Path:             "/blog",
+			CSSLocator:       "article",
+			ExcludeSelectors: []string{".ads", ".comments"},
+		},
+		{
+			Path:             "/products",
+			CSSLocator:       ".product-description",
+			ExcludeSelectors: []string{".related-items"},
+		},
+	}
+
+	scraperOverrides := convertPathOverrides(configOverrides)
+
+	if len(scraperOverrides) != len(configOverrides) {
+		t.Errorf("Expected %d overrides, got %d", len(configOverrides), len(scraperOverrides))
+	}
+
+	for i, override := range scraperOverrides {
+		if override.Path != configOverrides[i].Path {
+			t.Errorf("Expected Path %s, got %s", configOverrides[i].Path, override.Path)
+		}
+		if override.CSSLocator != configOverrides[i].CSSLocator {
+			t.Errorf("Expected CSSLocator %s, got %s", configOverrides[i].CSSLocator, override.CSSLocator)
+		}
+		if len(override.ExcludeSelectors) != len(configOverrides[i].ExcludeSelectors) {
+			t.Errorf("Expected %d ExcludeSelectors, got %d", len(configOverrides[i].ExcludeSelectors), len(override.ExcludeSelectors))
+		}
+		for j, selector := range override.ExcludeSelectors {
+			if selector != configOverrides[i].ExcludeSelectors[j] {
+				t.Errorf("Expected ExcludeSelector %s, got %s", configOverrides[i].ExcludeSelectors[j], selector)
+			}
+		}
+	}
+}
+
+func TestSanitizeFilename(t *testing.T) {
+	tests := []struct {
+		input    string
+		expected string
+	}{
+		{"Hello, World!", "Hello_World"},
+		{"file/with/path", "file_with_path"},
+		{"file.with.dots", "file_with_dots"},
+		{"___leading_underscores___", "leading_underscores"},
+		{"", "untitled"},
+		{"!@#$%^&*()", "untitled"},
+	}
+
+	for _, test := range tests {
+		result := sanitizeFilename(test.input)
+		if result != test.expected {
+			t.Errorf("sanitizeFilename(%q) = %q; want %q", test.input, result, test.expected)
+		}
+	}
+}
+
+func TestGetFilenameFromContent(t *testing.T) {
+	tests := []struct {
+		content  string
+		url      string
+		expected string
+		expectErr bool
+	}{
+		{"<title>Test Page</title>", "http://example.com", "Test_Page.rollup.md", false},
+		{"No title here", "http://example.com/page", "example_com_page.rollup.md", false},
+		{"<title>  Trim  Me  </title>", "http://example.com", "Trim_Me.rollup.md", false},
+		{"<title></title>", "http://example.com", "example_com.rollup.md", false},
+		{"<title>   </title>", "http://example.com", "example_com.rollup.md", false},
+		{"Invalid URL", "not a valid url", "", true},
+		{"No host", "http://", "", true},
+	}
+
+	for _, test := range tests {
+		result, err := getFilenameFromContent(test.content, test.url)
+		if test.expectErr {
+			if err == nil {
+				t.Errorf("getFilenameFromContent(%q, %q) expected an error, but got none", test.content, test.url)
+			}
+		} else {
+			if err != nil {
+				t.Errorf("getFilenameFromContent(%q, %q) unexpected error: %v", test.content, test.url, err)
+			}
+			if result != test.expected {
+				t.Errorf("getFilenameFromContent(%q, %q) = %q; want %q", test.content, test.url, result, test.expected)
+			}
+		}
+	}
+}
+
+// Mock functions for testing
+func mockExtractAndConvertContent(urlStr string) (string, error) {
+	return "Mocked content for " + urlStr, nil
+}
+
+func mockExtractLinks() ([]string, error) {
+	return []string{"http://example.com/link1", "http://example.com/link2"}, nil
+}
+
+func TestScrapeURL(t *testing.T) {
+	// Store the original functions
+	originalExtractAndConvertContent := testExtractAndConvertContent
+	originalExtractLinks := testExtractLinks
+
+	// Define mock functions
+	testExtractAndConvertContent = func(urlStr string) (string, error) {
+		return "Mocked content for " + urlStr, nil
+	}
+	testExtractLinks = func(urlStr string) ([]string, error) {
+		return []string{"http://example.com/link1", "http://example.com/link2"}, nil
+	}
+
+	// Defer the restoration of original functions
+	defer func() {
+		testExtractAndConvertContent = originalExtractAndConvertContent
+		testExtractLinks = originalExtractLinks
+	}()
+
+	tests := []struct {
+		url           string
+		depth         int
+		expectedCalls int
+	}{
+		{"http://example.com", 0, 1},
+		{"http://example.com", 1, 3},
+		{"http://example.com", 2, 3}, // Same as depth 1 because our mock only returns 2 links
+	}
+
+	for _, test := range tests {
+		visited := make(map[string]bool)
+		content, err := scrapeURL(test.url, test.depth, visited)
+		if err != nil {
+			t.Errorf("scrapeURL(%q, %d) returned error: %v", test.url, test.depth, err)
+			continue
+		}
+		if len(visited) != test.expectedCalls {
+			t.Errorf("scrapeURL(%q, %d) made %d calls, expected %d", test.url, test.depth, len(visited), test.expectedCalls)
+		}
+		expectedContent := "Mocked content for " + test.url
+		if !strings.Contains(content, expectedContent) {
+			t.Errorf("scrapeURL(%q, %d) content doesn't contain %q", test.url, test.depth, expectedContent)
+		}
+	}
+}
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -0,0 +1,21 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [0.0.3] - 2024-09-22
+
+### Added
+- Implemented web scraping functionality using Playwright
+- Added support for CSS selectors to extract specific content
+- Introduced rate limiting for web requests
+- Created configuration options for scraping settings
+
+### Changed
+- Improved error handling and logging throughout the application
+- Enhanced URL parsing and validation
+
+### Fixed
+- Resolved issues with concurrent scraping operations
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -0,0 +1,72 @@
+package config
+
+import (
+	"fmt"
+	"os"
+
+	"gopkg.in/yaml.v2"
+)
+
+// Config is the root document decoded from the rollup YAML file by Load.
+// Each field is filled from the YAML key named in its struct tag.
+type Config struct {
+	// FileTypes holds the entries of the "file_types" list.
+	FileTypes     []string     `yaml:"file_types"`
+	// Ignore holds the entries of the "ignore" list.
+	Ignore        []string     `yaml:"ignore"`
+	// CodeGenerated holds the entries of the "code_generated" list.
+	CodeGenerated []string     `yaml:"code_generated"`
+	// Scrape holds the nested "scrape" section (sites and rate limits).
+	Scrape        ScrapeConfig `yaml:"scrape"`
+}
+
+// ScrapeConfig is the "scrape" section of the configuration file.
+//
+// Load replaces a non-positive RequestsPerSecond with 1.0 and a non-positive
+// BurstLimit with 5, so both are always positive on a loaded Config.
+type ScrapeConfig struct {
+	Sites             []SiteConfig `yaml:"sites"`
+	OutputType        string       `yaml:"output_type"`
+	RequestsPerSecond float64      `yaml:"requests_per_second"`
+	BurstLimit        int          `yaml:"burst_limit"`
+}
+
+// SiteConfig is one entry of the "sites" list inside the scrape section.
+// Every field is read from the YAML key named in its struct tag.
+type SiteConfig struct {
+	BaseURL                string         `yaml:"base_url"`
+	CSSLocator             string         `yaml:"css_locator"`
+	ExcludeSelectors       []string       `yaml:"exclude_selectors"`
+	MaxDepth               int            `yaml:"max_depth"`
+	AllowedPaths           []string       `yaml:"allowed_paths"`
+	ExcludePaths           []string       `yaml:"exclude_paths"`
+	OutputAlias            string         `yaml:"output_alias"`
+	PathOverrides          []PathOverride `yaml:"path_overrides"`
+	LinksContainerSelector string         `yaml:"links_container_selector"`
+}
+
+// PathOverride is one entry of a site's "path_overrides" list: a path plus
+// the CSS locator and exclude selectors configured for it.
+type PathOverride struct {
+	Path             string   `yaml:"path"`
+	CSSLocator       string   `yaml:"css_locator"`
+	ExcludeSelectors []string `yaml:"exclude_selectors"`
+}
+
+// Load reads the YAML file at configPath and decodes it into a Config.
+//
+// After decoding, non-positive scrape.requests_per_second and
+// scrape.burst_limit values are replaced with 1.0 and 5 respectively.
+//
+// The returned error wraps the underlying cause, so callers can inspect it
+// with errors.Is / errors.As (for example errors.Is(err, os.ErrNotExist)).
+func Load(configPath string) (*Config, error) {
+	data, err := os.ReadFile(configPath)
+	if err != nil {
+		return nil, fmt.Errorf("error reading config file: %w", err)
+	}
+
+	var config Config
+	if err := yaml.Unmarshal(data, &config); err != nil {
+		return nil, fmt.Errorf("error parsing config file: %w", err)
+	}
+
+	// Missing or invalid rate-limit settings fall back to the defaults.
+	if config.Scrape.RequestsPerSecond <= 0 {
+		config.Scrape.RequestsPerSecond = 1.0
+	}
+	if config.Scrape.BurstLimit <= 0 {
+		config.Scrape.BurstLimit = 5
+	}
+	return &config, nil
+}
+
+func DefaultConfigPath() string {
+	return "rollup.yml"
+}
+
+// FileExists reports whether os.Stat succeeds for filename. Any Stat error,
+// not only "does not exist", is reported as false.
+func FileExists(filename string) bool {
+	if _, statErr := os.Stat(filename); statErr != nil {
+		return false
+	}
+	return true
+}
+
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -0,0 +1,120 @@
+package config
+
+import (
+	"os"
+	"reflect"
+	"testing"
+)
+
+// TestLoad writes a fully populated YAML fixture to a temporary file and
+// checks that Load decodes it into the expected Config value.
+func TestLoad(t *testing.T) {
+	yamlFixture := []byte(`
+file_types:
+  - go
+  - md
+ignore:
+  - "*.tmp"
+  - "**/*.log"
+code_generated:
+  - "generated_*.go"
+scrape:
+  sites:
+    - base_url: "https://example.com"
+      css_locator: "main"
+      exclude_selectors:
+        - ".ads"
+      max_depth: 2
+      allowed_paths:
+        - "/blog"
+      exclude_paths:
+        - "/admin"
+      output_alias: "example"
+      path_overrides:
+        - path: "/special"
+          css_locator: ".special-content"
+          exclude_selectors:
+            - ".sidebar"
+  output_type: "single"
+  requests_per_second: 1.0
+  burst_limit: 5
+`)
+
+	f, err := os.CreateTemp("", "config*.yml")
+	if err != nil {
+		t.Fatalf("Failed to create temp file: %v", err)
+	}
+	path := f.Name()
+	defer os.Remove(path)
+
+	if _, err = f.Write(yamlFixture); err != nil {
+		t.Fatalf("Failed to write to temp file: %v", err)
+	}
+	if err = f.Close(); err != nil {
+		t.Fatalf("Failed to close temp file: %v", err)
+	}
+
+	got, err := Load(path)
+	if err != nil {
+		t.Fatalf("Load() failed: %v", err)
+	}
+
+	// Build the expectation bottom-up: override, then site, then root.
+	override := PathOverride{
+		Path:             "/special",
+		CSSLocator:       ".special-content",
+		ExcludeSelectors: []string{".sidebar"},
+	}
+	site := SiteConfig{
+		BaseURL:          "https://example.com",
+		CSSLocator:       "main",
+		ExcludeSelectors: []string{".ads"},
+		MaxDepth:         2,
+		AllowedPaths:     []string{"/blog"},
+		ExcludePaths:     []string{"/admin"},
+		OutputAlias:      "example",
+		PathOverrides:    []PathOverride{override},
+	}
+	want := &Config{
+		FileTypes:     []string{"go", "md"},
+		Ignore:        []string{"*.tmp", "**/*.log"},
+		CodeGenerated: []string{"generated_*.go"},
+		Scrape: ScrapeConfig{
+			Sites:             []SiteConfig{site},
+			OutputType:        "single",
+			RequestsPerSecond: 1.0,
+			BurstLimit:        5,
+		},
+	}
+
+	if reflect.DeepEqual(got, want) {
+		return
+	}
+	t.Errorf("Loaded config does not match expected config.\nGot: %+v\nWant: %+v", got, want)
+}
+
+func TestDefaultConfigPath(t *testing.T) {
+	expected := "rollup.yml"
+	result := DefaultConfigPath()
+	if result != expected {
+		t.Errorf("DefaultConfigPath() = %q, want %q", result, expected)
+	}
+}
+
+// TestFileExists checks FileExists against a file that was just created and
+// against a name that is not expected to exist.
+func TestFileExists(t *testing.T) {
+	// Test with an existing file
+	tmpfile, err := os.CreateTemp("", "testfile")
+	if err != nil {
+		t.Fatalf("Failed to create temp file: %v", err)
+	}
+	defer os.Remove(tmpfile.Name())
+
+	// Only the path is needed; close the handle so the descriptor is not
+	// leaked and the file can be removed on platforms that forbid deleting
+	// open files.
+	if err := tmpfile.Close(); err != nil {
+		t.Fatalf("Failed to close temp file: %v", err)
+	}
+
+	if !FileExists(tmpfile.Name()) {
+		t.Errorf("FileExists(%q) = false, want true", tmpfile.Name())
+	}
+
+	// Test with a non-existing file
+	if FileExists("non_existing_file.txt") {
+		t.Errorf("FileExists(\"non_existing_file.txt\") = true, want false")
+	}
+}
--- a/internal/scraper/scraper.go
+++ b/internal/scraper/scraper.go
@@ -0,0 +1,676 @@
+package scraper
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"log"
+	"math/rand"
+	"net/url"
+	"os"
+	"regexp"
+	"strings"
+	"sync"
+	"time"
+
+	md "github.com/JohannesKaufmann/html-to-markdown"
+	"github.com/PuerkitoBio/goquery"
+	"github.com/playwright-community/playwright-go"
+	"golang.org/x/time/rate"
+)
+
+// logger receives all diagnostic output of this package. It starts as a
+// discarding logger so that package functions can be called before
+// SetupLogger without dereferencing a nil *log.Logger; SetupLogger replaces it.
+var logger = log.New(io.Discard, "", 0)
+
+// Package-level Playwright handles. InitPlaywright assigns them and
+// ClosePlaywright closes/stops whichever of them is non-nil.
+var (
+	// pw is the value returned by playwright.Run.
+	pw      *playwright.Playwright
+	// browser is the launched Chromium instance; FetchWebpageContent and
+	// ExtractLinks open their pages on it.
+	browser playwright.Browser
+)
+
+// Config holds the scraper configuration passed to ScrapeSites.
+type Config struct {
+	// Sites lists the sites to scrape; ScrapeSites starts one goroutine per entry.
+	Sites      []SiteConfig
+	// OutputType is not read by the code in this file.
+	// NOTE(review): presumably selects single vs. per-page output - confirm with callers.
+	OutputType string
+	// Verbose is not read by the code in this file; logging is configured via SetupLogger.
+	Verbose    bool
+	// Scrape carries the rate-limiter settings used by ScrapeSites.
+	Scrape     ScrapeConfig
+}
+
+// ScrapeConfig holds the scraping-specific configuration. ScrapeSites feeds
+// both values into rate.NewLimiter and substitutes 1.0 / 5 when a value is
+// not positive.
+type ScrapeConfig struct {
+	// RequestsPerSecond is the limiter's rate.
+	RequestsPerSecond float64
+	// BurstLimit is the limiter's burst size.
+	BurstLimit        int
+}
+
+// SiteConfig holds configuration for a single site.
+//
+// BaseURL is the site root; ScrapeSites appends each AllowedPaths entry to it
+// to form the start URLs, and isAllowedURL requires followed links to share
+// its host. AllowedPaths and ExcludePaths are matched as path prefixes.
+// CSSLocator and ExcludeSelectors select the content to keep and drop, unless
+// a PathOverrides entry matches the URL path. MaxDepth bounds link following,
+// and LinksContainerSelector, when set, makes a page act as a link index
+// rather than as content.
+type SiteConfig struct {
+	BaseURL                string
+	CSSLocator             string
+	ExcludeSelectors       []string
+	MaxDepth               int
+	AllowedPaths           []string
+	ExcludePaths           []string
+	OutputAlias            string
+	PathOverrides          []PathOverride
+	LinksContainerSelector string
+}
+
+// PathOverride holds path-specific overrides. getOverrides applies the first
+// entry whose Path is a prefix of the URL path.
+type PathOverride struct {
+	// Path is the URL path prefix this override applies to.
+	Path             string
+	// CSSLocator replaces the site's locator when non-empty.
+	CSSLocator       string
+	// ExcludeSelectors replaces the site's exclude selectors for matching paths.
+	ExcludeSelectors []string
+}
+
+// ScrapeSites scrapes every site in config.Sites and returns the extracted
+// content keyed by URL.
+//
+// One goroutine is started per site. It calls scrapeSingleURL for BaseURL
+// joined with each AllowedPaths entry; a site without AllowedPaths is scraped
+// starting at BaseURL itself, matching isAllowedURL, which treats an empty
+// list as "no restriction". All requests share one rate limiter built from
+// config.Scrape (non-positive values fall back to 1.0 requests/second and a
+// burst of 5).
+//
+// Per-URL failures are logged and omitted from the map; the returned error is
+// always nil.
+func ScrapeSites(config Config) (map[string]string, error) {
+	logger.Println("Starting ScrapeSites function - Verbose mode is active")
+	results := make(chan struct {
+		url     string
+		content string
+		err     error
+	})
+
+	// Ensure RequestsPerSecond and BurstLimit are valid
+	if config.Scrape.RequestsPerSecond <= 0 {
+		config.Scrape.RequestsPerSecond = 1.0
+	}
+	if config.Scrape.BurstLimit <= 0 {
+		config.Scrape.BurstLimit = 5
+	}
+
+	limiter := rate.NewLimiter(rate.Limit(config.Scrape.RequestsPerSecond), config.Scrape.BurstLimit)
+	logger.Printf("Rate limiter configured with %f requests per second and burst limit of %d\n", config.Scrape.RequestsPerSecond, config.Scrape.BurstLimit)
+
+	var (
+		wg        sync.WaitGroup
+		mu        sync.Mutex // guards totalURLs
+		totalURLs int
+	)
+	for _, site := range config.Sites {
+		logger.Printf("Processing site: %s\n", site.BaseURL)
+		wg.Add(1)
+		go func(site SiteConfig) {
+			defer wg.Done()
+
+			// Without explicit paths, start from the base URL instead of
+			// silently scraping nothing.
+			paths := site.AllowedPaths
+			if len(paths) == 0 {
+				paths = []string{""}
+			}
+
+			// visited is private to this site's goroutine.
+			visited := make(map[string]bool)
+			for _, path := range paths {
+				fullURL := site.BaseURL + path
+				mu.Lock()
+				totalURLs++
+				mu.Unlock()
+				logger.Printf("Queueing URL for scraping: %s\n", fullURL)
+				scrapeSingleURL(fullURL, site, results, limiter, visited, 0)
+			}
+		}(site)
+	}
+
+	// Close results once every per-site goroutine has returned so the
+	// collection loop below terminates.
+	go func() {
+		wg.Wait()
+		close(results)
+		logger.Println("All goroutines completed, results channel closed")
+	}()
+
+	scrapedContent := make(map[string]string)
+	for result := range results {
+		if result.err != nil {
+			logger.Printf("Error scraping %s: %v\n", result.url, result.err)
+			continue
+		}
+		logger.Printf("Successfully scraped content from %s (length: %d)\n", result.url, len(result.content))
+		scrapedContent[result.url] = result.content
+	}
+
+	logger.Printf("Total URLs processed: %d\n", totalURLs)
+	logger.Printf("Successfully scraped content from %d URLs\n", len(scrapedContent))
+
+	return scrapedContent, nil
+}
+
+// scrapeSingleURL fetches url and publishes its outcome on results.
+//
+// The call is a no-op when url is already in visited, or when site.MaxDepth
+// is positive and currentDepth exceeds it (a MaxDepth of 0 or less disables
+// the depth check). Otherwise url is marked visited, the limiter is awaited
+// and the page is fetched.
+//
+// If site.LinksContainerSelector is set, the page is treated as an index:
+// every allowed, unvisited link inside the matching containers is scraped
+// with currentDepth+1 and the index page itself produces no result.
+// Otherwise the content selected by getOverrides is extracted and sent.
+//
+// Linked pages are scraped synchronously in the calling goroutine. visited is
+// a plain map and results is closed once the caller's goroutine finishes, so
+// spawning untracked goroutines here would race on the map and could send on
+// a closed channel.
+func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
+	url     string
+	content string
+	err     error
+}, limiter *rate.Limiter, visited map[string]bool, currentDepth int) {
+	if site.MaxDepth > 0 && currentDepth > site.MaxDepth {
+		return
+	}
+
+	if visited[url] {
+		return
+	}
+	visited[url] = true
+
+	// report publishes one outcome for this URL.
+	report := func(content string, err error) {
+		results <- struct {
+			url     string
+			content string
+			err     error
+		}{url, content, err}
+	}
+
+	logger.Printf("Starting to scrape URL: %s\n", url)
+
+	// Wait for rate limiter before making the request
+	if err := limiter.Wait(context.Background()); err != nil {
+		logger.Printf("Rate limiter error for %s: %v\n", url, err)
+		report("", fmt.Errorf("rate limiter error: %w", err))
+		return
+	}
+
+	content, err := FetchWebpageContent(url)
+	if err != nil {
+		logger.Printf("Error fetching content for %s: %v\n", url, err)
+		report("", err)
+		return
+	}
+
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
+	if err != nil {
+		logger.Printf("Error parsing HTML for %s: %v\n", url, err)
+		report("", fmt.Errorf("error parsing HTML: %w", err))
+		return
+	}
+
+	if site.LinksContainerSelector != "" {
+		logger.Printf("Processing links container for %s\n", url)
+		doc.Find(site.LinksContainerSelector).Each(func(_ int, container *goquery.Selection) {
+			container.Find("a[href]").Each(func(_ int, link *goquery.Selection) {
+				href, exists := link.Attr("href")
+				if !exists {
+					return
+				}
+				resolvedURL := resolveURL(href, url)
+				if isAllowedURL(resolvedURL, site) && !visited[resolvedURL] {
+					scrapeSingleURL(resolvedURL, site, results, limiter, visited, currentDepth+1)
+				}
+			})
+		})
+		return
+	}
+
+	cssLocator, excludeSelectors := getOverrides(url, site)
+	logger.Printf("Using CSS locator for %s: %s\n", url, cssLocator)
+	logger.Printf("Exclude selectors for %s: %v\n", url, excludeSelectors)
+
+	extractedContent, err := ExtractContentWithCSS(content, cssLocator, excludeSelectors)
+	if err != nil {
+		logger.Printf("Error extracting content for %s: %v\n", url, err)
+		report("", err)
+		return
+	}
+
+	if extractedContent == "" {
+		logger.Printf("Warning: Empty content scraped from %s\n", url)
+	} else {
+		logger.Printf("Successfully scraped content from %s (length: %d)\n", url, len(extractedContent))
+	}
+
+	report(extractedContent, nil)
+}
+
+func scrapeSite(site SiteConfig, results chan<- struct {
+	url     string
+	content string
+	err     error
+}, limiter *rate.Limiter,
+) {
+	visited := make(map[string]bool)
+	queue := []string{site.BaseURL}
+
+	for len(queue) > 0 {
+		url := queue[0]
+		queue = queue[1:]
+
+		if visited[url] {
+			continue
+		}
+		visited[url] = true
+
+		if !isAllowedURL(url, site) {
+			continue
+		}
+
+		// Wait for rate limiter before making the request
+		err := limiter.Wait(context.Background())
+		if err != nil {
+			results <- struct {
+				url     string
+				content string
+				err     error
+			}{url, "", fmt.Errorf("rate limiter error: %v", err)}
+			continue
+		}
+
+		cssLocator, excludeSelectors := getOverrides(url, site)
+		content, err := scrapeURL(url, cssLocator, excludeSelectors)
+		results <- struct {
+			url     string
+			content string
+			err     error
+		}{url, content, err}
+
+		if len(visited) < site.MaxDepth {
+			links, _ := ExtractLinks(url)
+			for _, link := range links {
+				if !visited[link] && isAllowedURL(link, site) {
+					queue = append(queue, link)
+				}
+			}
+		}
+	}
+}
+
+// isAllowedURL reports whether urlStr may be scraped for site.
+//
+// A URL is allowed when it parses, its host equals the host of site.BaseURL,
+// its path starts with one of site.AllowedPaths (an empty list allows every
+// path) and does not start with any of site.ExcludePaths. Matching is a plain
+// string prefix test, so "/blog" also matches "/blogger".
+//
+// An unparsable urlStr or site.BaseURL yields false.
+func isAllowedURL(urlStr string, site SiteConfig) bool {
+	parsedURL, err := url.Parse(urlStr)
+	if err != nil {
+		return false
+	}
+
+	// A malformed base URL cannot match any host; bail out rather than
+	// dereferencing a nil *url.URL.
+	baseURL, err := url.Parse(site.BaseURL)
+	if err != nil {
+		return false
+	}
+	if parsedURL.Host != baseURL.Host {
+		return false
+	}
+
+	path := parsedURL.Path
+
+	// Check if the URL is within allowed paths
+	if len(site.AllowedPaths) > 0 {
+		allowed := false
+		for _, allowedPath := range site.AllowedPaths {
+			if strings.HasPrefix(path, allowedPath) {
+				allowed = true
+				break
+			}
+		}
+		if !allowed {
+			return false
+		}
+	}
+
+	// Check if the URL is in excluded paths
+	for _, excludePath := range site.ExcludePaths {
+		if strings.HasPrefix(path, excludePath) {
+			return false
+		}
+	}
+
+	return true
+}
+
+// getOverrides returns the CSS locator and exclude selectors to use for
+// urlStr.
+//
+// The first entry of site.PathOverrides whose Path is a prefix of the URL
+// path wins: its ExcludeSelectors are returned together with its CSSLocator,
+// or with the site-level locator when the override leaves CSSLocator empty.
+// Without a matching override, or when urlStr cannot be parsed, the
+// site-level settings are returned.
+func getOverrides(urlStr string, site SiteConfig) (string, []string) {
+	parsedURL, err := url.Parse(urlStr)
+	if err != nil {
+		// No path to match against; use the site defaults instead of
+		// dereferencing a nil *url.URL.
+		return site.CSSLocator, site.ExcludeSelectors
+	}
+	path := parsedURL.Path
+
+	for _, override := range site.PathOverrides {
+		if !strings.HasPrefix(path, override.Path) {
+			continue
+		}
+		if override.CSSLocator != "" {
+			return override.CSSLocator, override.ExcludeSelectors
+		}
+		return site.CSSLocator, override.ExcludeSelectors
+	}
+
+	return site.CSSLocator, site.ExcludeSelectors
+}
+
+// scrapeURL fetches url and converts it with ProcessHTMLContent. When
+// cssLocator is non-empty the page is first narrowed with
+// ExtractContentWithCSS; otherwise the whole fetched page is converted.
+// The first error encountered is returned unchanged.
+func scrapeURL(url, cssLocator string, excludeSelectors []string) (string, error) {
+	html, fetchErr := FetchWebpageContent(url)
+	if fetchErr != nil {
+		return "", fetchErr
+	}
+
+	if cssLocator == "" {
+		return ProcessHTMLContent(html, Config{})
+	}
+
+	narrowed, extractErr := ExtractContentWithCSS(html, cssLocator, excludeSelectors)
+	if extractErr != nil {
+		return "", extractErr
+	}
+	return ProcessHTMLContent(narrowed, Config{})
+}
+
+func getFilenameFromContent(content, url string) string {
+	// Try to extract title from content
+	titleStart := strings.Index(content, "<title>")
+	titleEnd := strings.Index(content, "</title>")
+	if titleStart != -1 && titleEnd != -1 && titleEnd > titleStart {
+		title := content[titleStart+7 : titleEnd]
+		return sanitizeFilename(title) + ".md"
+	}
+
+	// If no title found, use the URL
+	return sanitizeFilename(url) + ".md"
+}
+
+func sanitizeFilename(name string) string {
+	// Remove any character that isn't alphanumeric, dash, or underscore
+	reg, _ := regexp.Compile("[^a-zA-Z0-9-_]+")
+	return reg.ReplaceAllString(name, "_")
+}
+
+// URLConfig holds configuration for a single URL.
+// NOTE(review): no code in this part of the file references this type -
+// confirm whether it is still used elsewhere.
+type URLConfig struct {
+	// URL is the page address.
+	URL              string
+	// CSSLocator and ExcludeSelectors mirror the SiteConfig fields of the same name.
+	CSSLocator       string
+	ExcludeSelectors []string
+	// OutputAlias mirrors SiteConfig.OutputAlias.
+	OutputAlias      string
+}
+
+// SetupLogger installs the package logger. With verbose set, messages go to
+// standard output with the "SCRAPER: " prefix and standard timestamp flags;
+// otherwise all output is discarded.
+func SetupLogger(verbose bool) {
+	if !verbose {
+		logger = log.New(io.Discard, "", 0)
+		return
+	}
+	logger = log.New(os.Stdout, "SCRAPER: ", log.LstdFlags)
+}
+
+// InitPlaywright initializes Playwright and launches the browser
+func InitPlaywright() error {
+	logger.Println("Initializing Playwright")
+	var err error
+
+	// Install Playwright and Chromium browser
+	err = playwright.Install(&playwright.RunOptions{Browsers: []string{"chromium"}})
+	if err != nil {
+		return fmt.Errorf("could not install Playwright and Chromium: %v", err)
+	}
+
+	pw, err = playwright.Run()
+	if err != nil {
+		return fmt.Errorf("could not start Playwright: %v", err)
+	}
+
+	userAgent := "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+
+	browser, err = pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
+		Args: []string{fmt.Sprintf("--user-agent=%s", userAgent)},
+	})
+	if err != nil {
+		return fmt.Errorf("could not launch browser: %v", err)
+	}
+
+	logger.Println("Playwright initialized successfully")
+	return nil
+}
+
+// ClosePlaywright closes the browser and stops Playwright.
+//
+// Either step is skipped when its handle is nil. Errors are logged rather
+// than returned, and both handles are reset to nil afterwards so a repeated
+// call is a no-op.
+func ClosePlaywright() {
+	if browser != nil {
+		if err := browser.Close(); err != nil {
+			logger.Printf("Error closing browser: %v\n", err)
+		}
+		browser = nil
+	}
+	if pw != nil {
+		if err := pw.Stop(); err != nil {
+			logger.Printf("Error stopping Playwright: %v\n", err)
+		}
+		pw = nil
+	}
+}
+
+// InitBrowser is an alias for InitPlaywright and returns its error unchanged.
+func InitBrowser() error {
+	if err := InitPlaywright(); err != nil {
+		return err
+	}
+	return nil
+}
+
+// CloseBrowser is an alias for ClosePlaywright: it closes the browser and
+// stops Playwright.
+func CloseBrowser() {
+	ClosePlaywright()
+}
+
+// FetchWebpageContent retrieves the rendered HTML of urlStr using Playwright.
+//
+// It opens a new page on the shared browser, sleeps for a random 1-3 seconds,
+// navigates and waits for the network to go idle, scrolls through the page
+// with scrollPage, waits for the body element to be visible and returns
+// page.Content(). If that is empty, the body's inner HTML is returned
+// instead. The page is closed before returning.
+//
+// An error is returned when the browser has not been initialised (see
+// InitPlaywright) or when any step fails; step errors wrap the cause.
+func FetchWebpageContent(urlStr string) (string, error) {
+	logger.Printf("Fetching webpage content for URL: %s\n", urlStr)
+
+	// Fail cleanly instead of dereferencing a nil browser.
+	if browser == nil {
+		return "", fmt.Errorf("could not create page: browser is not initialized")
+	}
+
+	page, err := browser.NewPage()
+	if err != nil {
+		logger.Printf("Error creating new page: %v\n", err)
+		return "", fmt.Errorf("could not create page: %w", err)
+	}
+	defer page.Close()
+
+	// Random pause of 1000-2999 ms before navigating.
+	time.Sleep(time.Duration(rand.Intn(2000)+1000) * time.Millisecond)
+
+	logger.Printf("Navigating to URL: %s\n", urlStr)
+	if _, err = page.Goto(urlStr, playwright.PageGotoOptions{
+		WaitUntil: playwright.WaitUntilStateNetworkidle,
+	}); err != nil {
+		logger.Printf("Error navigating to page: %v\n", err)
+		return "", fmt.Errorf("could not go to page: %w", err)
+	}
+
+	logger.Println("Waiting for page load state")
+	err = page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
+		State: playwright.LoadStateNetworkidle,
+	})
+	if err != nil {
+		logger.Printf("Error waiting for page load: %v\n", err)
+		return "", fmt.Errorf("error waiting for page load: %w", err)
+	}
+
+	logger.Println("Scrolling page")
+	err = scrollPage(page)
+	if err != nil {
+		logger.Printf("Error scrolling page: %v\n", err)
+		return "", fmt.Errorf("error scrolling page: %w", err)
+	}
+
+	logger.Println("Waiting for body element")
+
+	bodyElement := page.Locator("body")
+	err = bodyElement.WaitFor(playwright.LocatorWaitForOptions{
+		State: playwright.WaitForSelectorStateVisible,
+	})
+	if err != nil {
+		logger.Printf("Error waiting for body: %v\n", err)
+		return "", fmt.Errorf("error waiting for body: %w", err)
+	}
+
+	logger.Println("Getting page content")
+	content, err := page.Content()
+	if err != nil {
+		logger.Printf("Error getting page content: %v\n", err)
+		return "", fmt.Errorf("could not get page content: %w", err)
+	}
+
+	if content == "" {
+		logger.Println(" content is empty, falling back to body content")
+		content, err = bodyElement.InnerHTML()
+		if err != nil {
+			logger.Printf("Error getting body content: %v\n", err)
+			return "", fmt.Errorf("could not get body content: %w", err)
+		}
+	}
+
+	logger.Printf("Successfully fetched webpage content (length: %d)\n", len(content))
+	return content, nil
+}
+
+// ProcessHTMLContent converts the body of htmlContent to Markdown.
+//
+// The HTML is parsed with goquery, the inner HTML of the body element is
+// taken and passed through the html-to-markdown converter. The config
+// argument is accepted but not read. Parsing, extraction and conversion
+// failures, and a document without a body, are returned as errors.
+func ProcessHTMLContent(htmlContent string, config Config) (string, error) {
+	logger.Printf("Processing HTML content (length: %d)\n", len(htmlContent))
+
+	document, parseErr := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
+	if parseErr != nil {
+		logger.Printf("Error parsing HTML: %v\n", parseErr)
+		return "", fmt.Errorf("error parsing HTML: %v", parseErr)
+	}
+
+	body := document.Find("body")
+	logger.Println("Processing entire body")
+	if body.Length() == 0 {
+		return "", fmt.Errorf("no content found in the document")
+	}
+
+	bodyHTML, htmlErr := body.Html()
+	if htmlErr != nil {
+		logger.Printf("Error extracting content: %v\n", htmlErr)
+		return "", fmt.Errorf("error extracting content: %v", htmlErr)
+	}
+
+	// Convert the extracted HTML with a fresh converter.
+	markdown, convErr := md.NewConverter("", true, nil).ConvertString(bodyHTML)
+	if convErr != nil {
+		logger.Printf("Error converting HTML to Markdown: %v\n", convErr)
+		return "", fmt.Errorf("error converting HTML to Markdown: %v", convErr)
+	}
+
+	logger.Printf("Converted HTML to Markdown (length: %d)\n", len(markdown))
+	return markdown, nil
+}
+
+// scrollPage scrolls page to the bottom repeatedly so that lazily loaded
+// content can appear, then scrolls back to the top.
+//
+// Each iteration scrolls to the current document height, waits 500 ms inside
+// the page and reads the height again. The loop stops when the height no
+// longer changes or after 250 iterations. Previously the wait was written
+// after the script's return statement and therefore never ran, so the height
+// was re-read immediately and newly loaded content was not given time to
+// arrive.
+//
+// An error is returned when a script evaluation fails or yields a value that
+// is neither an int nor a float64.
+func scrollPage(page playwright.Page) error {
+	logger.Println("Starting page scroll")
+	// The function is async so that Evaluate waits for the pause to finish
+	// before the height is returned.
+	script := `
+		async () => {
+			window.scrollTo(0, document.body.scrollHeight);
+			// wait for 500 ms
+			await new Promise(resolve => setTimeout(resolve, 500));
+			return document.body.scrollHeight;
+		}
+	`
+
+	previousHeight := 0
+	for i := 0; i < 250; i++ {
+		height, err := page.Evaluate(script)
+		if err != nil {
+			logger.Printf("Error scrolling (iteration %d): %v\n", i+1, err)
+			return fmt.Errorf("error scrolling: %w", err)
+		}
+
+		var currentHeight int
+		switch v := height.(type) {
+		case int:
+			currentHeight = v
+		case float64:
+			currentHeight = int(v)
+		default:
+			logger.Printf("Unexpected height type: %T\n", height)
+			return fmt.Errorf("unexpected height type: %T", height)
+		}
+
+		logger.Printf("Scroll iteration %d: height = %d\n", i+1, currentHeight)
+
+		if currentHeight == previousHeight {
+			logger.Println("Reached bottom of the page")
+			break
+		}
+
+		previousHeight = currentHeight
+	}
+
+	logger.Println("Scrolling back to top")
+	if _, err := page.Evaluate(`() => { window.scrollTo(0, 0); }`); err != nil {
+		logger.Printf("Error scrolling back to top: %v\n", err)
+		return fmt.Errorf("error scrolling back to top: %w", err)
+	}
+
+	logger.Println("Page scroll completed")
+	return nil
+}
+
+// ExtractLinks extracts all links from the given URL.
+//
+// The page is opened in the shared browser and the href of every anchor
+// element is collected in document order, with trailing slashes removed.
+// Duplicates are kept. The result is nil when the page has no anchors.
+//
+// An error is returned when the browser has not been initialised, when
+// navigation or script evaluation fails, or when the script result is not a
+// list. Non-string list entries are logged and skipped.
+func ExtractLinks(urlStr string) ([]string, error) {
+	logger.Printf("Extracting links from URL: %s\n", urlStr)
+
+	// Fail cleanly instead of dereferencing a nil browser.
+	if browser == nil {
+		return nil, fmt.Errorf("could not create page: browser is not initialized")
+	}
+
+	page, err := browser.NewPage()
+	if err != nil {
+		return nil, fmt.Errorf("could not create page: %w", err)
+	}
+	defer page.Close()
+
+	if _, err = page.Goto(urlStr, playwright.PageGotoOptions{
+		WaitUntil: playwright.WaitUntilStateNetworkidle,
+	}); err != nil {
+		return nil, fmt.Errorf("could not go to page: %w", err)
+	}
+
+	raw, err := page.Evaluate(`() => {
+		const anchors = document.querySelectorAll('a');
+		return Array.from(anchors).map(a => a.href);
+	}`)
+	if err != nil {
+		return nil, fmt.Errorf("could not extract links: %w", err)
+	}
+
+	// Checked assertions: an unexpected result shape must not panic.
+	items, ok := raw.([]interface{})
+	if !ok {
+		return nil, fmt.Errorf("could not extract links: unexpected result type %T", raw)
+	}
+
+	var result []string
+	for _, item := range items {
+		href, ok := item.(string)
+		if !ok {
+			logger.Printf("Skipping non-string link value of type %T\n", item)
+			continue
+		}
+		// Normalize URL by removing trailing slash
+		result = append(result, strings.TrimRight(href, "/"))
+	}
+
+	logger.Printf("Extracted %d links\n", len(result))
+	return result, nil
+}
+
+// ExtractContentWithCSS returns the inner HTML selected by includeSelector.
+//
+// When the selector matches nothing, the body element is used instead and a
+// warning is logged. Elements matching any of excludeSelectors are removed
+// from the selection first. The resulting HTML is whitespace-normalised:
+// line endings become "\n", every line is trimmed, and leading and trailing
+// blank lines are dropped.
+func ExtractContentWithCSS(content, includeSelector string, excludeSelectors []string) (string, error) {
+	logger.Printf("Extracting content with CSS selector: %s\n", includeSelector)
+
+	document, parseErr := goquery.NewDocumentFromReader(strings.NewReader(content))
+	if parseErr != nil {
+		return "", fmt.Errorf("error parsing HTML: %v", parseErr)
+	}
+
+	target := document.Find(includeSelector)
+	if target.Length() == 0 {
+		logger.Printf("Warning: No content found with CSS selector: %s. Falling back to body content.\n", includeSelector)
+		if target = document.Find("body"); target.Length() == 0 {
+			return "", fmt.Errorf("no content found in body")
+		}
+	}
+
+	for _, unwanted := range excludeSelectors {
+		target.Find(unwanted).Remove()
+	}
+
+	html, htmlErr := target.Html()
+	if htmlErr != nil {
+		return "", fmt.Errorf("error extracting content with CSS selector: %v", htmlErr)
+	}
+
+	// Trim the outer whitespace, then unify line endings in one pass
+	// ("\r\n" is listed first so it is not split into two newlines).
+	html = strings.TrimSpace(html)
+	html = strings.NewReplacer("\r\n", "\n", "\r", "\n").Replace(html)
+
+	// Drop indentation line by line while keeping the line structure.
+	rawLines := strings.Split(html, "\n")
+	trimmed := make([]string, len(rawLines))
+	for idx := range rawLines {
+		trimmed[idx] = strings.TrimSpace(rawLines[idx])
+	}
+
+	// Strip blank lines left at either end.
+	result := strings.Trim(strings.Join(trimmed, "\n"), "\n")
+
+	logger.Printf("Extracted content length: %d\n", len(result))
+	return result, nil
+}
+// resolveURL resolves href against base and returns the absolute URL as a
+// string. If either argument fails to parse, href is returned unchanged.
+func resolveURL(href, base string) string {
+	baseURL, baseErr := url.Parse(base)
+	ref, refErr := url.Parse(href)
+	if baseErr != nil || refErr != nil {
+		return href
+	}
+	return baseURL.ResolveReference(ref).String()
+}
--- a/internal/scraper/scraper_test.go
+++ b/internal/scraper/scraper_test.go
@@ -0,0 +1,181 @@
+package scraper
+
+import (
+	"io"
+	"log"
+	"net/http"
+	"net/http/httptest"
+	"reflect"
+	"strings"
+	"testing"
+)
+
+// TestIsAllowedURL covers allowed paths, excluded paths, paths outside the
+// allow list and a foreign host.
+func TestIsAllowedURL(t *testing.T) {
+	cfg := SiteConfig{
+		BaseURL:      "https://example.com",
+		AllowedPaths: []string{"/blog", "/products"},
+		ExcludePaths: []string{"/admin", "/private"},
+	}
+
+	cases := []struct {
+		url      string
+		expected bool
+	}{
+		{url: "https://example.com/blog/post1", expected: true},
+		{url: "https://example.com/products/item1", expected: true},
+		{url: "https://example.com/admin/dashboard", expected: false},
+		{url: "https://example.com/private/data", expected: false},
+		{url: "https://example.com/other/page", expected: false},
+		{url: "https://othersite.com/blog/post1", expected: false},
+	}
+
+	for _, tc := range cases {
+		if got := isAllowedURL(tc.url, cfg); got != tc.expected {
+			t.Errorf("isAllowedURL(%q) = %v, want %v", tc.url, got, tc.expected)
+		}
+	}
+}
+
+// TestGetOverrides checks that the site defaults apply to ordinary paths and
+// that the "/special" override applies to that path and everything below it.
+func TestGetOverrides(t *testing.T) {
+	special := PathOverride{
+		Path:             "/special",
+		CSSLocator:       ".special-content",
+		ExcludeSelectors: []string{".sidebar"},
+	}
+	cfg := SiteConfig{
+		CSSLocator:       "main",
+		ExcludeSelectors: []string{".ads"},
+		PathOverrides:    []PathOverride{special},
+	}
+
+	cases := []struct {
+		url              string
+		expectedLocator  string
+		expectedExcludes []string
+	}{
+		{"https://example.com/normal", "main", []string{".ads"}},
+		{"https://example.com/special", ".special-content", []string{".sidebar"}},
+		{"https://example.com/special/page", ".special-content", []string{".sidebar"}},
+	}
+
+	for _, tc := range cases {
+		gotLocator, gotExcludes := getOverrides(tc.url, cfg)
+		if gotLocator != tc.expectedLocator {
+			t.Errorf("getOverrides(%q) locator = %q, want %q", tc.url, gotLocator, tc.expectedLocator)
+		}
+		if !reflect.DeepEqual(gotExcludes, tc.expectedExcludes) {
+			t.Errorf("getOverrides(%q) excludes = %v, want %v", tc.url, gotExcludes, tc.expectedExcludes)
+		}
+	}
+}
+
+// TestExtractContentWithCSS checks selector-based extraction with and without
+// exclude selectors against a small fixed document.
+func TestExtractContentWithCSS(t *testing.T) {
+	// The function under test logs; silence it for the test run.
+	logger = log.New(io.Discard, "", 0)
+
+	page := `
+		<html>
+			<body>
+				<main>
+					<h1>Main Content</h1>
+					<p>This is the main content.</p>
+					<div class="ads">Advertisement</div>
+				</main>
+				<aside>Sidebar content</aside>
+			</body>
+		</html>
+	`
+
+	cases := []struct {
+		includeSelector  string
+		excludeSelectors []string
+		expected         string
+	}{
+		{"main", nil, "<h1>Main Content</h1>\n<p>This is the main content.</p>\n<div class=\"ads\">Advertisement</div>"},
+		{"main", []string{".ads"}, "<h1>Main Content</h1>\n<p>This is the main content.</p>"},
+		{"aside", nil, "Sidebar content"},
+	}
+
+	for _, tc := range cases {
+		got, err := ExtractContentWithCSS(page, tc.includeSelector, tc.excludeSelectors)
+		if err != nil {
+			t.Errorf("ExtractContentWithCSS() returned error: %v", err)
+			continue
+		}
+		if strings.TrimSpace(got) == strings.TrimSpace(tc.expected) {
+			continue
+		}
+		t.Errorf("ExtractContentWithCSS() = %q, want %q", got, tc.expected)
+	}
+}
+
+// TestProcessHTMLContent checks the Markdown produced for a heading, a
+// paragraph with bold text and an unordered list.
+func TestProcessHTMLContent(t *testing.T) {
+	// ProcessHTMLContent logs. Set the logger here so the test does not
+	// depend on another test having initialised it (for example when run
+	// alone with -run).
+	logger = log.New(io.Discard, "", 0)
+
+	html := `
+		<html>
+			<body>
+				<h1>Test Heading</h1>
+				<p>This is a <strong>test</strong> paragraph.</p>
+				<ul>
+					<li>Item 1</li>
+					<li>Item 2</li>
+				</ul>
+			</body>
+		</html>
+	`
+
+	expected := strings.TrimSpace(`
+# Test Heading
+
+This is a **test** paragraph.
+
+- Item 1
+- Item 2
+	`)
+
+	result, err := ProcessHTMLContent(html, Config{})
+	if err != nil {
+		t.Fatalf("ProcessHTMLContent() returned error: %v", err)
+	}
+
+	if strings.TrimSpace(result) != expected {
+		t.Errorf("ProcessHTMLContent() = %q, want %q", result, expected)
+	}
+}
+
+// TestExtractLinks serves a page with three anchors from a local test server
+// and checks that ExtractLinks returns their targets in document order with
+// trailing slashes removed. It needs a working Playwright/Chromium install.
+func TestExtractLinks(t *testing.T) {
+	// InitPlaywright and ExtractLinks log. Set the logger here so the test
+	// does not depend on another test having initialised it.
+	logger = log.New(io.Discard, "", 0)
+
+	// Initialize Playwright before running the test
+	if err := InitPlaywright(); err != nil {
+		t.Fatalf("Failed to initialize Playwright: %v", err)
+	}
+	defer ClosePlaywright()
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/html")
+		w.Write([]byte(`
+			<html>
+				<body>
+					<a href="https://example.com/page1">Page 1</a>
+					<a href="https://example.com/page2">Page 2</a>
+					<a href="https://othersite.com">Other Site</a>
+				</body>
+			</html>
+		`))
+	}))
+	defer server.Close()
+
+	links, err := ExtractLinks(server.URL)
+	if err != nil {
+		t.Fatalf("ExtractLinks() returned error: %v", err)
+	}
+
+	expectedLinks := []string{
+		"https://example.com/page1",
+		"https://example.com/page2",
+		"https://othersite.com",
+	}
+
+	if !reflect.DeepEqual(links, expectedLinks) {
+		t.Errorf("ExtractLinks() = %v, want %v", links, expectedLinks)
+	}
+}
--- a/main.go
+++ b/main.go
@@ -20,7 +20,7 @@ func main() {
 	var err error

 	if !isHelpCommand {
-		configPath := "rollup.yml"
+		configPath := config.DefaultConfigPath()
 		cfg, err = config.Load(configPath)
 		if err != nil {
 			log.Printf("Warning: Failed to load configuration: %v", err)
Author	SHA1	Message	Date
Arik Jones (aider)	e42ad24999	docs: update configuration section in README.md to include scrape parameters and example usage	2024-09-30 14:20:17 -05:00
Arik Jones (aider)	01465a08b7	fix: set default values for requests_per_second and burst_limit in configuration to prevent rate limiter errors	2024-09-30 14:19:00 -05:00
Arik Jones (aider)	e3355269b8	refactor: remove redundant scraping functions and update runWeb to utilize scraper.ScrapeSites for improved maintainability	2024-09-30 14:10:37 -05:00
Arik Jones (aider)	54c3776baf	fix: update scrapeSingleURL calls to include visited map and currentDepth for thread safety and correct functionality	2024-09-30 14:08:16 -05:00
Arik Jones (aider)	ee1561c502	feat: add LinksContainerSelector to SiteConfig and enhance scraping logic with depth control and link extraction	2024-09-30 14:05:10 -05:00
Arik Jones (aider)	5e8a257ff8	feat: implement links container selector for targeted scraping of linked content	2024-09-30 14:04:41 -05:00