From 333b9a366cf2c8e1c25c5183a3ba7c6a0d02077b Mon Sep 17 00:00:00 2001 From: Arik Jones Date: Tue, 24 Sep 2024 15:13:36 -0500 Subject: [PATCH] fix: Resolve playwright function deprecations and io/ioutil function deprecations. --- cmd/web.go | 174 +++++++-------- cmd/web_test.go | 2 +- internal/config/config_test.go | 4 +- internal/scraper/scraper.go | 354 ++++++++++++++++--------------- internal/scraper/scraper_test.go | 16 +- 5 files changed, 280 insertions(+), 270 deletions(-) diff --git a/cmd/web.go b/cmd/web.go index f35418c..ecc24d4 100644 --- a/cmd/web.go +++ b/cmd/web.go @@ -2,7 +2,7 @@ package cmd import ( "fmt" - "io/ioutil" + "io" "log" "net/url" "os" @@ -41,93 +41,93 @@ func init() { } func runWeb(cmd *cobra.Command, args []string) error { - scraper.SetupLogger(verbose) - logger := log.New(os.Stdout, "WEB: ", log.LstdFlags) - if !verbose { - logger.SetOutput(ioutil.Discard) - } - logger.Printf("Starting web scraping process with verbose mode: %v", verbose) - scraperConfig.Verbose = verbose + scraper.SetupLogger(verbose) + logger := log.New(os.Stdout, "WEB: ", log.LstdFlags) + if !verbose { + logger.SetOutput(io.Discard) + } + logger.Printf("Starting web scraping process with verbose mode: %v", verbose) + scraperConfig.Verbose = verbose - var siteConfigs []scraper.SiteConfig - if len(cfg.Scrape.Sites) > 0 { - logger.Printf("Using configuration from rollup.yml for %d sites", len(cfg.Scrape.Sites)) - siteConfigs = make([]scraper.SiteConfig, len(cfg.Scrape.Sites)) - for i, site := range cfg.Scrape.Sites { - siteConfigs[i] = scraper.SiteConfig{ - BaseURL: site.BaseURL, - CSSLocator: site.CSSLocator, - ExcludeSelectors: site.ExcludeSelectors, - MaxDepth: site.MaxDepth, - AllowedPaths: site.AllowedPaths, - ExcludePaths: site.ExcludePaths, - OutputAlias: site.OutputAlias, - PathOverrides: convertPathOverrides(site.PathOverrides), - } - logger.Printf("Site %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d, AllowedPaths=%v", - i+1, site.BaseURL, site.CSSLocator, site.MaxDepth, site.AllowedPaths) - } - } else { - logger.Printf("No sites defined in rollup.yml, falling back to URL-based configuration") - siteConfigs = make([]scraper.SiteConfig, len(urls)) - for i, u := range urls { - siteConfigs[i] = scraper.SiteConfig{ - BaseURL: u, - CSSLocator: includeSelector, - ExcludeSelectors: excludeSelectors, - MaxDepth: depth, - } - logger.Printf("URL %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d", - i+1, u, includeSelector, depth) - } - } + var siteConfigs []scraper.SiteConfig + if len(cfg.Scrape.Sites) > 0 { + logger.Printf("Using configuration from rollup.yml for %d sites", len(cfg.Scrape.Sites)) + siteConfigs = make([]scraper.SiteConfig, len(cfg.Scrape.Sites)) + for i, site := range cfg.Scrape.Sites { + siteConfigs[i] = scraper.SiteConfig{ + BaseURL: site.BaseURL, + CSSLocator: site.CSSLocator, + ExcludeSelectors: site.ExcludeSelectors, + MaxDepth: site.MaxDepth, + AllowedPaths: site.AllowedPaths, + ExcludePaths: site.ExcludePaths, + OutputAlias: site.OutputAlias, + PathOverrides: convertPathOverrides(site.PathOverrides), + } + logger.Printf("Site %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d, AllowedPaths=%v", + i+1, site.BaseURL, site.CSSLocator, site.MaxDepth, site.AllowedPaths) + } + } else { + logger.Printf("No sites defined in rollup.yml, falling back to URL-based configuration") + siteConfigs = make([]scraper.SiteConfig, len(urls)) + for i, u := range urls { + siteConfigs[i] = scraper.SiteConfig{ + BaseURL: u, + CSSLocator: includeSelector, + 
ExcludeSelectors: excludeSelectors, + MaxDepth: depth, + } + logger.Printf("URL %d configuration: BaseURL=%s, CSSLocator=%s, MaxDepth=%d", + i+1, u, includeSelector, depth) + } + } - if len(siteConfigs) == 0 { - logger.Println("Error: No sites or URLs provided") - return fmt.Errorf("no sites or URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.sites' in the rollup.yml file") - } + if len(siteConfigs) == 0 { + logger.Println("Error: No sites or URLs provided") + return fmt.Errorf("no sites or URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.sites' in the rollup.yml file") + } - // Set default values for rate limiting - defaultRequestsPerSecond := 1.0 - defaultBurstLimit := 3 + // Set default values for rate limiting + defaultRequestsPerSecond := 1.0 + defaultBurstLimit := 3 - // Use default values if not set in the configuration - requestsPerSecond := cfg.Scrape.RequestsPerSecond - if requestsPerSecond == 0 { - requestsPerSecond = defaultRequestsPerSecond - } - burstLimit := cfg.Scrape.BurstLimit - if burstLimit == 0 { - burstLimit = defaultBurstLimit - } + // Use default values if not set in the configuration + requestsPerSecond := cfg.Scrape.RequestsPerSecond + if requestsPerSecond == 0 { + requestsPerSecond = defaultRequestsPerSecond + } + burstLimit := cfg.Scrape.BurstLimit + if burstLimit == 0 { + burstLimit = defaultBurstLimit + } - scraperConfig := scraper.Config{ - Sites: siteConfigs, - OutputType: outputType, - Verbose: verbose, - Scrape: scraper.ScrapeConfig{ - RequestsPerSecond: requestsPerSecond, - BurstLimit: burstLimit, - }, - } - logger.Printf("Scraper configuration: OutputType=%s, RequestsPerSecond=%f, BurstLimit=%d", - outputType, requestsPerSecond, burstLimit) + scraperConfig := scraper.Config{ + Sites: siteConfigs, + OutputType: outputType, + Verbose: verbose, + Scrape: scraper.ScrapeConfig{ + RequestsPerSecond: requestsPerSecond, + BurstLimit: burstLimit, + }, + } + logger.Printf("Scraper configuration: OutputType=%s, RequestsPerSecond=%f, BurstLimit=%d", + outputType, requestsPerSecond, burstLimit) - logger.Println("Starting scraping process") - scrapedContent, err := scraper.ScrapeSites(scraperConfig) - if err != nil { - logger.Printf("Error occurred during scraping: %v", err) - return fmt.Errorf("error scraping content: %v", err) - } - logger.Printf("Scraping completed. Total content scraped: %d", len(scrapedContent)) + logger.Println("Starting scraping process") + scrapedContent, err := scraper.ScrapeSites(scraperConfig) + if err != nil { + logger.Printf("Error occurred during scraping: %v", err) + return fmt.Errorf("error scraping content: %v", err) + } + logger.Printf("Scraping completed. 
Total content scraped: %d", len(scrapedContent)) - if outputType == "single" { - logger.Println("Writing content to a single file") - return writeSingleFile(scrapedContent) - } else { - logger.Println("Writing content to multiple files") - return writeMultipleFiles(scrapedContent) - } + if outputType == "single" { + logger.Println("Writing content to a single file") + return writeSingleFile(scrapedContent) + } else { + logger.Println("Writing content to multiple files") + return writeMultipleFiles(scrapedContent) + } } func writeSingleFile(content map[string]string) error { @@ -139,7 +139,7 @@ func writeSingleFile(content map[string]string) error { defer file.Close() for url, c := range content { - _, err = fmt.Fprintf(file, "# Content from %s\n\n%s\n\n---\n\n", url, c) + _, err = fmt.Fprintf(file, "# ::: Content from %s\n\n%s\n\n---\n\n", url, c) if err != nil { return fmt.Errorf("error writing content to file: %v", err) } @@ -161,7 +161,7 @@ func writeMultipleFiles(content map[string]string) error { return fmt.Errorf("error creating output file %s: %v", filename, err) } - _, err = file.WriteString(fmt.Sprintf("# Content from %s\n\n%s\n", url, c)) + _, err = file.WriteString(fmt.Sprintf("# ::: Content from %s\n\n%s\n", url, c)) if err != nil { file.Close() return fmt.Errorf("error writing content to file %s: %v", filename, err) @@ -215,8 +215,10 @@ func scrapeURL(urlStr string, depth int, visited map[string]bool) (string, error return content, nil } -var testExtractAndConvertContent = extractAndConvertContent -var testExtractLinks = scraper.ExtractLinks +var ( + testExtractAndConvertContent = extractAndConvertContent + testExtractLinks = scraper.ExtractLinks +) func extractAndConvertContent(urlStr string) (string, error) { content, err := scraper.FetchWebpageContent(urlStr) @@ -240,7 +242,7 @@ func extractAndConvertContent(urlStr string) (string, error) { if err != nil { return "", fmt.Errorf("error parsing URL: %v", err) } - header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String()) + header := fmt.Sprintf("# ::: Content from %s\n\n", parsedURL.String()) return header + markdown + "\n\n", nil } diff --git a/cmd/web_test.go b/cmd/web_test.go index 8e470be..3852487 100644 --- a/cmd/web_test.go +++ b/cmd/web_test.go @@ -103,7 +103,7 @@ func mockExtractAndConvertContent(urlStr string) (string, error) { return "Mocked content for " + urlStr, nil } -func mockExtractLinks(urlStr string) ([]string, error) { +func mockExtractLinks() ([]string, error) { return []string{"http://example.com/link1", "http://example.com/link2"}, nil } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index a05c23f..483c333 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -45,10 +45,10 @@ scrape: } defer os.Remove(tmpfile.Name()) - if _, err := tmpfile.Write(content); err != nil { + if _, err = tmpfile.Write(content); err != nil { t.Fatalf("Failed to write to temp file: %v", err) } - if err := tmpfile.Close(); err != nil { + if err = tmpfile.Close(); err != nil { t.Fatalf("Failed to close temp file: %v", err) } diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go index e5413e0..1e145e5 100644 --- a/internal/scraper/scraper.go +++ b/internal/scraper/scraper.go @@ -1,21 +1,21 @@ package scraper import ( + "context" "fmt" - "io/ioutil" + "io" "log" "math/rand" "net/url" "os" "regexp" "strings" - "time" "sync" - "context" + "time" + md "github.com/JohannesKaufmann/html-to-markdown" "github.com/PuerkitoBio/goquery" 
"github.com/playwright-community/playwright-go" - md "github.com/JohannesKaufmann/html-to-markdown" "golang.org/x/time/rate" ) @@ -60,208 +60,210 @@ type PathOverride struct { } func ScrapeSites(config Config) (map[string]string, error) { - logger.Println("Starting ScrapeSites function - Verbose mode is active") - results := make(chan struct { - url string - content string - err error - }) + logger.Println("Starting ScrapeSites function - Verbose mode is active") + results := make(chan struct { + url string + content string + err error + }) - limiter := rate.NewLimiter(rate.Limit(config.Scrape.RequestsPerSecond), config.Scrape.BurstLimit) - logger.Printf("Rate limiter configured with %f requests per second and burst limit of %d\n", config.Scrape.RequestsPerSecond, config.Scrape.BurstLimit) + limiter := rate.NewLimiter(rate.Limit(config.Scrape.RequestsPerSecond), config.Scrape.BurstLimit) + logger.Printf("Rate limiter configured with %f requests per second and burst limit of %d\n", config.Scrape.RequestsPerSecond, config.Scrape.BurstLimit) - var wg sync.WaitGroup - totalURLs := 0 - for _, site := range config.Sites { - logger.Printf("Processing site: %s\n", site.BaseURL) - wg.Add(1) - go func(site SiteConfig) { - defer wg.Done() - for _, path := range site.AllowedPaths { - fullURL := site.BaseURL + path - totalURLs++ - logger.Printf("Queueing URL for scraping: %s\n", fullURL) - scrapeSingleURL(fullURL, site, config, results, limiter) - } - }(site) - } + var wg sync.WaitGroup + totalURLs := 0 + for _, site := range config.Sites { + logger.Printf("Processing site: %s\n", site.BaseURL) + wg.Add(1) + go func(site SiteConfig) { + defer wg.Done() + for _, path := range site.AllowedPaths { + fullURL := site.BaseURL + path + totalURLs++ + logger.Printf("Queueing URL for scraping: %s\n", fullURL) + scrapeSingleURL(fullURL, site, results, limiter) + } + }(site) + } - go func() { - wg.Wait() - close(results) - logger.Println("All goroutines completed, results channel closed") - }() + go func() { + wg.Wait() + close(results) + logger.Println("All goroutines completed, results channel closed") + }() - scrapedContent := make(map[string]string) - for result := range results { - if result.err != nil { - logger.Printf("Error scraping %s: %v\n", result.url, result.err) - continue - } - logger.Printf("Successfully scraped content from %s (length: %d)\n", result.url, len(result.content)) - scrapedContent[result.url] = result.content - } + scrapedContent := make(map[string]string) + for result := range results { + if result.err != nil { + logger.Printf("Error scraping %s: %v\n", result.url, result.err) + continue + } + logger.Printf("Successfully scraped content from %s (length: %d)\n", result.url, len(result.content)) + scrapedContent[result.url] = result.content + } - logger.Printf("Total URLs processed: %d\n", totalURLs) - logger.Printf("Successfully scraped content from %d URLs\n", len(scrapedContent)) + logger.Printf("Total URLs processed: %d\n", totalURLs) + logger.Printf("Successfully scraped content from %d URLs\n", len(scrapedContent)) - return scrapedContent, nil + return scrapedContent, nil } -func scrapeSingleURL(url string, site SiteConfig, config Config, results chan<- struct { - url string - content string - err error -}, limiter *rate.Limiter) { - logger.Printf("Starting to scrape URL: %s\n", url) +func scrapeSingleURL(url string, site SiteConfig, results chan<- struct { + url string + content string + err error +}, limiter *rate.Limiter, +) { + logger.Printf("Starting to scrape URL: %s\n", url) 
- // Wait for rate limiter before making the request - err := limiter.Wait(context.Background()) - if err != nil { - logger.Printf("Rate limiter error for %s: %v\n", url, err) - results <- struct { - url string - content string - err error - }{url, "", fmt.Errorf("rate limiter error: %v", err)} - return - } + // Wait for rate limiter before making the request + err := limiter.Wait(context.Background()) + if err != nil { + logger.Printf("Rate limiter error for %s: %v\n", url, err) + results <- struct { + url string + content string + err error + }{url, "", fmt.Errorf("rate limiter error: %v", err)} + return + } - cssLocator, excludeSelectors := getOverrides(url, site) - logger.Printf("Using CSS locator for %s: %s\n", url, cssLocator) - logger.Printf("Exclude selectors for %s: %v\n", url, excludeSelectors) + cssLocator, excludeSelectors := getOverrides(url, site) + logger.Printf("Using CSS locator for %s: %s\n", url, cssLocator) + logger.Printf("Exclude selectors for %s: %v\n", url, excludeSelectors) - content, err := scrapeURL(url, cssLocator, excludeSelectors) - if err != nil { - logger.Printf("Error scraping %s: %v\n", url, err) - results <- struct { - url string - content string - err error - }{url, "", err} - return - } + content, err := scrapeURL(url, cssLocator, excludeSelectors) + if err != nil { + logger.Printf("Error scraping %s: %v\n", url, err) + results <- struct { + url string + content string + err error + }{url, "", err} + return + } - if content == "" { - logger.Printf("Warning: Empty content scraped from %s\n", url) - } else { - logger.Printf("Successfully scraped content from %s (length: %d)\n", url, len(content)) - } + if content == "" { + logger.Printf("Warning: Empty content scraped from %s\n", url) + } else { + logger.Printf("Successfully scraped content from %s (length: %d)\n", url, len(content)) + } - results <- struct { - url string - content string - err error - }{url, content, nil} + results <- struct { + url string + content string + err error + }{url, content, nil} } -func scrapeSite(site SiteConfig, config Config, results chan<- struct { - url string - content string - err error -}, limiter *rate.Limiter) { - visited := make(map[string]bool) - queue := []string{site.BaseURL} +func scrapeSite(site SiteConfig, results chan<- struct { + url string + content string + err error +}, limiter *rate.Limiter, +) { + visited := make(map[string]bool) + queue := []string{site.BaseURL} - for len(queue) > 0 { - url := queue[0] - queue = queue[1:] + for len(queue) > 0 { + url := queue[0] + queue = queue[1:] - if visited[url] { - continue - } - visited[url] = true + if visited[url] { + continue + } + visited[url] = true - if !isAllowedURL(url, site) { - continue - } + if !isAllowedURL(url, site) { + continue + } - // Wait for rate limiter before making the request - err := limiter.Wait(context.Background()) - if err != nil { - results <- struct { - url string - content string - err error - }{url, "", fmt.Errorf("rate limiter error: %v", err)} - continue - } + // Wait for rate limiter before making the request + err := limiter.Wait(context.Background()) + if err != nil { + results <- struct { + url string + content string + err error + }{url, "", fmt.Errorf("rate limiter error: %v", err)} + continue + } - cssLocator, excludeSelectors := getOverrides(url, site) - content, err := scrapeURL(url, cssLocator, excludeSelectors) - results <- struct { - url string - content string - err error - }{url, content, err} + cssLocator, excludeSelectors := getOverrides(url, site) + content, err 
:= scrapeURL(url, cssLocator, excludeSelectors) + results <- struct { + url string + content string + err error + }{url, content, err} - if len(visited) < site.MaxDepth { - links, _ := ExtractLinks(url) - for _, link := range links { - if !visited[link] && isAllowedURL(link, site) { - queue = append(queue, link) - } - } - } - } + if len(visited) < site.MaxDepth { + links, _ := ExtractLinks(url) + for _, link := range links { + if !visited[link] && isAllowedURL(link, site) { + queue = append(queue, link) + } + } + } + } } func isAllowedURL(urlStr string, site SiteConfig) bool { - parsedURL, err := url.Parse(urlStr) - if err != nil { - return false - } + parsedURL, err := url.Parse(urlStr) + if err != nil { + return false + } - baseURL, _ := url.Parse(site.BaseURL) - if parsedURL.Host != baseURL.Host { - return false - } + baseURL, _ := url.Parse(site.BaseURL) + if parsedURL.Host != baseURL.Host { + return false + } - path := parsedURL.Path - for _, allowedPath := range site.AllowedPaths { - if strings.HasPrefix(path, allowedPath) { - for _, excludePath := range site.ExcludePaths { - if strings.HasPrefix(path, excludePath) { - return false - } - } - return true - } - } + path := parsedURL.Path + for _, allowedPath := range site.AllowedPaths { + if strings.HasPrefix(path, allowedPath) { + for _, excludePath := range site.ExcludePaths { + if strings.HasPrefix(path, excludePath) { + return false + } + } + return true + } + } - return false + return false } func getOverrides(urlStr string, site SiteConfig) (string, []string) { - parsedURL, _ := url.Parse(urlStr) - path := parsedURL.Path + parsedURL, _ := url.Parse(urlStr) + path := parsedURL.Path - for _, override := range site.PathOverrides { - if strings.HasPrefix(path, override.Path) { - if override.CSSLocator != "" { - return override.CSSLocator, override.ExcludeSelectors - } - return site.CSSLocator, override.ExcludeSelectors - } - } + for _, override := range site.PathOverrides { + if strings.HasPrefix(path, override.Path) { + if override.CSSLocator != "" { + return override.CSSLocator, override.ExcludeSelectors + } + return site.CSSLocator, override.ExcludeSelectors + } + } - return site.CSSLocator, site.ExcludeSelectors + return site.CSSLocator, site.ExcludeSelectors } func scrapeURL(url, cssLocator string, excludeSelectors []string) (string, error) { - content, err := FetchWebpageContent(url) - if err != nil { - return "", err - } + content, err := FetchWebpageContent(url) + if err != nil { + return "", err + } - if cssLocator != "" { - content, err = ExtractContentWithCSS(content, cssLocator, excludeSelectors) - if err != nil { - return "", err - } - } + if cssLocator != "" { + content, err = ExtractContentWithCSS(content, cssLocator, excludeSelectors) + if err != nil { + return "", err + } + } - return ProcessHTMLContent(content, Config{}) + return ProcessHTMLContent(content, Config{}) } func getFilenameFromContent(content, url string) string { @@ -296,7 +298,7 @@ func SetupLogger(verbose bool) { if verbose { logger = log.New(os.Stdout, "SCRAPER: ", log.LstdFlags) } else { - logger = log.New(ioutil.Discard, "", 0) + logger = log.New(io.Discard, "", 0) } } @@ -387,7 +389,9 @@ func FetchWebpageContent(urlStr string) (string, error) { } logger.Println("Waiting for body element") - _, err = page.WaitForSelector("body", playwright.PageWaitForSelectorOptions{ + + bodyElement := page.Locator("body") + err = bodyElement.WaitFor(playwright.LocatorWaitForOptions{ State: playwright.WaitForSelectorStateVisible, }) if err != nil { @@ -404,7 +408,7 
@@ func FetchWebpageContent(urlStr string) (string, error) {
 
 	if content == "" {
 		logger.Println(" content is empty, falling back to body content")
-		content, err = page.InnerHTML("body")
+		content, err = bodyElement.InnerHTML()
 		if err != nil {
 			logger.Printf("Error getting body content: %v\n", err)
 			return "", fmt.Errorf("could not get body content: %v", err)
@@ -457,6 +461,8 @@ func scrollPage(page playwright.Page) error {
-		() => {
+		async () => {
 			window.scrollTo(0, document.body.scrollHeight);
+			// Give lazily loaded content ~500 ms to render before measuring the new height.
+			await new Promise(resolve => setTimeout(resolve, 500));
 			return document.body.scrollHeight;
 		}
 	`
 
@@ -488,7 +494,8 @@ func scrollPage(page playwright.Page) error {
 
 		previousHeight = currentHeight
 
-		page.WaitForTimeout(500)
+		// The 500 ms pause between scrolls now happens inside the scroll script above,
+		// so the page.WaitForTimeout call is no longer needed here.
 	}
 
 	logger.Println("Scrolling back to top")
diff --git a/internal/scraper/scraper_test.go b/internal/scraper/scraper_test.go
index 4d7b0a5..0420287 100644
--- a/internal/scraper/scraper_test.go
+++ b/internal/scraper/scraper_test.go
@@ -1,13 +1,13 @@
 package scraper
 
 import (
-	"testing"
+	"io"
+	"log"
 	"net/http"
 	"net/http/httptest"
-	"strings"
 	"reflect"
-	"log"
-	"io/ioutil"
+	"strings"
+	"testing"
 )
 
 func TestIsAllowedURL(t *testing.T) {
@@ -51,9 +51,9 @@ func TestGetOverrides(t *testing.T) {
 	}
 
 	tests := []struct {
-		url             string
-		expectedLocator string
-		expectedExcludes []string
+		url              string
+		expectedLocator  string
+		expectedExcludes []string
 	}{
 		{"https://example.com/normal", "main", []string{".ads"}},
 		{"https://example.com/special", ".special-content", []string{".sidebar"}},
@@ -73,7 +73,7 @@ func TestExtractContentWithCSS(t *testing.T) {
 	// Initialize logger for testing
-	logger = log.New(ioutil.Discard, "", 0)
+	logger = log.New(io.Discard, "", 0)
 
 	html := `