diff --git a/cmd/web.go b/cmd/web.go
index 53cca48..9387e7a 100644
--- a/cmd/web.go
+++ b/cmd/web.go
@@ -204,45 +204,17 @@ func generateDefaultFilename() string {
 	return fmt.Sprintf("web-%s.rollup.md", timestamp)
 }
 
-func scrapeRecursively(urlStr string, depth int) (string, error) {
-	visited := make(map[string]bool)
-	return scrapeURL(urlStr, depth, visited)
-}
-
-func scrapeURL(urlStr string, depth int, visited map[string]bool) (string, error) {
-	if depth < 0 || visited[urlStr] {
-		return "", nil
-	}
-
-	visited[urlStr] = true
-
+func scrapeURL(urlStr string) (string, error) {
 	content, err := testExtractAndConvertContent(urlStr)
 	if err != nil {
 		return "", err
 	}
 
-	if depth > 0 {
-		links, err := testExtractLinks(urlStr)
-		if err != nil {
-			return content, fmt.Errorf("error extracting links: %v", err)
-		}
-
-		for _, link := range links {
-			subContent, err := scrapeURL(link, depth-1, visited)
-			if err != nil {
-				fmt.Printf("Warning: Error scraping %s: %v\n", link, err)
-				continue
-			}
-			content += "\n\n---\n\n" + subContent
-		}
-	}
-
 	return content, nil
 }
 
 var (
 	testExtractAndConvertContent = extractAndConvertContent
-	testExtractLinks             = scraper.ExtractLinks
 )
 
 func extractAndConvertContent(urlStr string) (string, error) {
diff --git a/internal/config/config.go b/internal/config/config.go
index dc18838..3b57b5f 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -42,9 +42,6 @@ type SiteConfig struct {
 	// ExcludeSelectors lists CSS selectors for content to exclude
 	ExcludeSelectors []string `yaml:"exclude_selectors"`
 
-	// MaxDepth sets the maximum depth for link traversal
-	MaxDepth int `yaml:"max_depth"`
-
 	// AllowedPaths lists paths that are allowed to be scraped
 	AllowedPaths []string `yaml:"allowed_paths"`
 
@@ -103,9 +100,6 @@ func (c *Config) Validate() error {
 		if site.BaseURL == "" {
 			return fmt.Errorf("base_url must be specified for each site")
 		}
-		if site.MaxDepth < 0 {
-			return fmt.Errorf("max_depth must be non-negative")
-		}
 	}
 
 	return nil
diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go
index 1e145e5..270e340 100644
--- a/internal/scraper/scraper.go
+++ b/internal/scraper/scraper.go
@@ -45,7 +45,6 @@ type SiteConfig struct {
 	BaseURL          string
 	CSSLocator       string
 	ExcludeSelectors []string
-	MaxDepth         int
 	AllowedPaths     []string
 	ExcludePaths     []string
 	OutputAlias      string
@@ -510,40 +509,6 @@ func scrollPage(page playwright.Page) error {
 	return nil
 }
 
-// ExtractLinks extracts all links from the given URL
-func ExtractLinks(urlStr string) ([]string, error) {
-	logger.Printf("Extracting links from URL: %s\n", urlStr)
-
-	page, err := browser.NewPage()
-	if err != nil {
-		return nil, fmt.Errorf("could not create page: %v", err)
-	}
-	defer page.Close()
-
-	if _, err = page.Goto(urlStr, playwright.PageGotoOptions{
-		WaitUntil: playwright.WaitUntilStateNetworkidle,
-	}); err != nil {
-		return nil, fmt.Errorf("could not go to page: %v", err)
-	}
-
-	links, err := page.Evaluate(`() => {
-		const anchors = document.querySelectorAll('a');
-		return Array.from(anchors).map(a => a.href);
-	}`)
-	if err != nil {
-		return nil, fmt.Errorf("could not extract links: %v", err)
-	}
-
-	var result []string
-	for _, link := range links.([]interface{}) {
-		// Normalize URL by removing trailing slash
-		normalizedLink := strings.TrimRight(link.(string), "/")
-		result = append(result, normalizedLink)
-	}
-
-	logger.Printf("Extracted %d links\n", len(result))
-	return result, nil
-}
 
 // ExtractContentWithCSS extracts content from HTML using a CSS selector
 func ExtractContentWithCSS(content, includeSelector string, excludeSelectors []string) (string, error) {