package scraper import ( "context" "fmt" "io" "log" "math/rand" "net/url" "os" "path/filepath" "regexp" "strings" "sync" "time" md "github.com/JohannesKaufmann/html-to-markdown" "github.com/PuerkitoBio/goquery" "github.com/playwright-community/playwright-go" "golang.org/x/time/rate" ) var logger *log.Logger var ( pw *playwright.Playwright browser playwright.Browser ) // Config holds the scraper configuration type Config struct { Sites []SiteConfig OutputType string Verbose bool Scrape ScrapeConfig } // ScrapeConfig holds the scraping-specific configuration type ScrapeConfig struct { RequestsPerSecond float64 BurstLimit int } // SiteConfig holds configuration for a single site type SiteConfig struct { BaseURL string CSSLocator string ExcludeSelectors []string AllowedPaths []string ExcludePaths []string FileNamePrefix string PathOverrides []PathOverride } // PathOverride holds path-specific overrides type PathOverride struct { Path string CSSLocator string ExcludeSelectors []string } func ScrapeSites(config Config) error { logger.Println("Starting ScrapeSites function - Verbose mode is active") results := make(chan struct { url string content string site SiteConfig // Add site config to track which site the content came from err error }) limiter := rate.NewLimiter(rate.Limit(config.Scrape.RequestsPerSecond), config.Scrape.BurstLimit) logger.Printf("Rate limiter configured with %f requests per second and burst limit of %d\n", config.Scrape.RequestsPerSecond, config.Scrape.BurstLimit) var wg sync.WaitGroup totalURLs := 0 for _, site := range config.Sites { totalURLs += len(site.AllowedPaths) } for _, site := range config.Sites { logger.Printf("Processing site: %s\n", site.BaseURL) wg.Add(1) go func(site SiteConfig) { defer wg.Done() for _, path := range site.AllowedPaths { fullURL := site.BaseURL + path logger.Printf("Queueing URL for scraping: %s\n", fullURL) scrapeSingleURL(fullURL, site, results, limiter) } }(site) } go func() { wg.Wait() close(results) logger.Println("All goroutines completed, results channel closed") }() // Use a map that includes site configuration scrapedContent := make(map[string]struct { content string site SiteConfig }) for result := range results { if result.err != nil { logger.Printf("Error scraping %s: %v\n", result.url, result.err) continue } logger.Printf("Successfully scraped content from %s (length: %d)\n", result.url, len(result.content)) scrapedContent[result.url] = struct { content string site SiteConfig }{ content: result.content, site: result.site, } } logger.Printf("Total URLs processed: %d\n", totalURLs) logger.Printf("Successfully scraped content from %d URLs\n", len(scrapedContent)) return SaveToFiles(scrapedContent, config) } func scrapeSingleURL(url string, site SiteConfig, results chan<- struct { url string content string site SiteConfig err error }, limiter *rate.Limiter) { logger.Printf("Starting to scrape URL: %s\n", url) err := limiter.Wait(context.Background()) if err != nil { results <- struct { url string content string site SiteConfig err error }{url, "", site, fmt.Errorf("rate limiter error: %v", err)} return } cssLocator, excludeSelectors := getOverrides(url, site) content, err := scrapeURL(url, cssLocator, excludeSelectors) if err != nil { results <- struct { url string content string site SiteConfig err error }{url, "", site, err} return } results <- struct { url string content string site SiteConfig err error }{url, content, site, nil} } func isAllowedURL(urlStr string, site SiteConfig) bool { parsedURL, err := url.Parse(urlStr) if err != nil { return false } baseURL, _ := url.Parse(site.BaseURL) if parsedURL.Host != baseURL.Host { return false } path := parsedURL.Path for _, allowedPath := range site.AllowedPaths { if strings.HasPrefix(path, allowedPath) { for _, excludePath := range site.ExcludePaths { if strings.HasPrefix(path, excludePath) { return false } } return true } } return false } func getOverrides(urlStr string, site SiteConfig) (string, []string) { parsedURL, _ := url.Parse(urlStr) path := parsedURL.Path for _, override := range site.PathOverrides { if strings.HasPrefix(path, override.Path) { if override.CSSLocator != "" { return override.CSSLocator, override.ExcludeSelectors } return site.CSSLocator, override.ExcludeSelectors } } return site.CSSLocator, site.ExcludeSelectors } func scrapeURL(url, cssLocator string, excludeSelectors []string) (string, error) { content, err := FetchWebpageContent(url) if err != nil { return "", err } if cssLocator != "" { content, err = ExtractContentWithCSS(content, cssLocator, excludeSelectors) if err != nil { return "", err } } return ProcessHTMLContent(content, Config{}) } func getFilenameFromContent(content, url string) string { // Try to extract title from content titleStart := strings.Index(content, "") titleEnd := strings.Index(content, "") if titleStart != -1 && titleEnd != -1 && titleEnd > titleStart { title := content[titleStart+7 : titleEnd] return sanitizeFilename(title) + ".md" } // If no title found, use the URL return sanitizeFilename(url) + ".md" } func sanitizeFilename(name string) string { // Replace all non-alphanumeric characters with dashes reg := regexp.MustCompile("[^a-zA-Z0-9]+") name = reg.ReplaceAllString(name, "-") // Remove any leading or trailing dashes name = strings.Trim(name, "-") // Collapse multiple consecutive dashes into one reg = regexp.MustCompile("-+") return reg.ReplaceAllString(name, "-") } // URLConfig holds configuration for a single URL type URLConfig struct { URL string CSSLocator string ExcludeSelectors []string FileNamePrefix string } // SetupLogger initializes the logger based on the verbose flag func SetupLogger(verbose bool) { if verbose { logger = log.New(os.Stdout, "SCRAPER: ", log.LstdFlags) } else { logger = log.New(io.Discard, "", 0) } } // InitPlaywright initializes Playwright and launches the browser func InitPlaywright() error { logger.Println("Initializing Playwright") var err error // Install Playwright and Chromium browser err = playwright.Install(&playwright.RunOptions{Browsers: []string{"chromium"}}) if err != nil { return fmt.Errorf("could not install Playwright and Chromium: %v", err) } pw, err = playwright.Run() if err != nil { return fmt.Errorf("could not start Playwright: %v", err) } userAgent := "Mozilla/5.0 (Linux; Android 15; Pixel 9 Build/AP3A.241105.008) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.6723.106 Mobile Safari/537.36 OPX/2.5" browser, err = pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{ Args: []string{fmt.Sprintf("--user-agent=%s", userAgent)}, }) if err != nil { return fmt.Errorf("could not launch browser: %v", err) } logger.Println("Playwright initialized successfully") return nil } // ClosePlaywright closes the browser and stops Playwright func ClosePlaywright() { if browser != nil { browser.Close() } if pw != nil { pw.Stop() } } // InitBrowser initializes the browser func InitBrowser() error { return InitPlaywright() } // CloseBrowser closes the browser func CloseBrowser() { ClosePlaywright() } // SaveToFiles writes the scraped content to files based on output type func SaveToFiles(content map[string]struct { content string site SiteConfig }, config Config) error { if config.OutputType == "" { config.OutputType = "separate" // default to separate files if not specified } switch config.OutputType { case "single": if err := os.MkdirAll("output", 0755); err != nil { return fmt.Errorf("failed to create output directory: %v", err) } var combined strings.Builder for url, data := range content { combined.WriteString(fmt.Sprintf("## %s\n\n", url)) combined.WriteString(data.content) combined.WriteString("\n\n") } return os.WriteFile(filepath.Join("output", "combined.md"), []byte(combined.String()), 0644) case "separate": if err := os.MkdirAll("output", 0755); err != nil { return fmt.Errorf("failed to create output directory: %v", err) } // Group content by site and path contentBySitePath := make(map[string]map[string]string) for urlStr, data := range content { parsedURL, err := url.Parse(urlStr) if err != nil { logger.Printf("Warning: Could not parse URL %s: %v", urlStr, err) continue } // Find matching allowed path for this URL var matchingPath string for _, path := range data.site.AllowedPaths { if strings.HasPrefix(parsedURL.Path, path) { matchingPath = path break } } if matchingPath == "" { logger.Printf("Warning: No matching allowed path for URL %s", urlStr) continue } siteKey := fmt.Sprintf("%s-%s", data.site.BaseURL, data.site.FileNamePrefix) if contentBySitePath[siteKey] == nil { contentBySitePath[siteKey] = make(map[string]string) } // Combine all content for the same path if existing, exists := contentBySitePath[siteKey][matchingPath]; exists { contentBySitePath[siteKey][matchingPath] = existing + "\n\n" + data.content } else { contentBySitePath[siteKey][matchingPath] = data.content } } // Write files for each site and path for siteKey, pathContent := range contentBySitePath { for path, content := range pathContent { parts := strings.SplitN(siteKey, "-", 2) // Split only on first hyphen prefix := parts[1] // Get the FileNamePrefix part if prefix == "" { prefix = "doc" // default prefix if none specified } normalizedPath := NormalizePathForFilename(path) if normalizedPath == "" { normalizedPath = "index" } filename := filepath.Join("output", fmt.Sprintf("%s-%s.md", prefix, normalizedPath)) // Ensure we don't have empty files if strings.TrimSpace(content) == "" { logger.Printf("Skipping empty content for path %s", path) continue } if err := os.WriteFile(filename, []byte(content), 0644); err != nil { return fmt.Errorf("failed to write file %s: %v", filename, err) } logger.Printf("Wrote content to %s", filename) } } return nil default: return fmt.Errorf("unsupported output type: %s", config.OutputType) } } // NormalizePathForFilename converts a URL path into a valid filename component func NormalizePathForFilename(urlPath string) string { // Remove leading/trailing slashes path := strings.Trim(urlPath, "/") // Replace all non-alphanumeric characters with dashes reg := regexp.MustCompile("[^a-zA-Z0-9]+") path = reg.ReplaceAllString(path, "-") // Remove any leading or trailing dashes path = strings.Trim(path, "-") // Collapse multiple consecutive dashes into one reg = regexp.MustCompile("-+") path = reg.ReplaceAllString(path, "-") return path } // FetchWebpageContent retrieves the content of a webpage using Playwright func FetchWebpageContent(urlStr string) (string, error) { logger.Printf("Fetching webpage content for URL: %s\n", urlStr) page, err := browser.NewPage() if err != nil { logger.Printf("Error creating new page: %v\n", err) return "", fmt.Errorf("could not create page: %v", err) } defer page.Close() time.Sleep(time.Duration(rand.Intn(2000)+1000) * time.Millisecond) logger.Printf("Navigating to URL: %s\n", urlStr) if _, err = page.Goto(urlStr, playwright.PageGotoOptions{ WaitUntil: playwright.WaitUntilStateNetworkidle, }); err != nil { logger.Printf("Error navigating to page: %v\n", err) return "", fmt.Errorf("could not go to page: %v", err) } logger.Println("Waiting for page load state") err = page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{ State: playwright.LoadStateNetworkidle, }) if err != nil { logger.Printf("Error waiting for page load: %v\n", err) return "", fmt.Errorf("error waiting for page load: %v", err) } logger.Println("Scrolling page") err = scrollPage(page) if err != nil { logger.Printf("Error scrolling page: %v\n", err) return "", fmt.Errorf("error scrolling page: %v", err) } logger.Println("Waiting for body element") bodyElement := page.Locator("body") err = bodyElement.WaitFor(playwright.LocatorWaitForOptions{ State: playwright.WaitForSelectorStateVisible, }) if err != nil { logger.Printf("Error waiting for body: %v\n", err) return "", fmt.Errorf("error waiting for body: %v", err) } logger.Println("Getting page content") content, err := page.Content() if err != nil { logger.Printf("Error getting page content: %v\n", err) return "", fmt.Errorf("could not get page content: %v", err) } if content == "" { logger.Println(" content is empty, falling back to body content") content, err = bodyElement.InnerHTML() if err != nil { logger.Printf("Error getting body content: %v\n", err) return "", fmt.Errorf("could not get body content: %v", err) } } logger.Printf("Successfully fetched webpage content (length: %d)\n", len(content)) return content, nil } // ProcessHTMLContent converts HTML content to Markdown func ProcessHTMLContent(htmlContent string, config Config) (string, error) { logger.Printf("Processing HTML content (length: %d)\n", len(htmlContent)) doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent)) if err != nil { logger.Printf("Error parsing HTML: %v\n", err) return "", fmt.Errorf("error parsing HTML: %v", err) } selection := doc.Find("body") logger.Println("Processing entire body") if selection.Length() == 0 { return "", fmt.Errorf("no content found in the document") } content, err := selection.Html() if err != nil { logger.Printf("Error extracting content: %v\n", err) return "", fmt.Errorf("error extracting content: %v", err) } // Create a new converter converter := md.NewConverter("", true, nil) // Convert HTML to Markdown markdown, err := converter.ConvertString(content) if err != nil { logger.Printf("Error converting HTML to Markdown: %v\n", err) return "", fmt.Errorf("error converting HTML to Markdown: %v", err) } logger.Printf("Converted HTML to Markdown (length: %d)\n", len(markdown)) return markdown, nil } func scrollPage(page playwright.Page) error { logger.Println("Starting page scroll") script := ` () => { window.scrollTo(0, document.body.scrollHeight); return document.body.scrollHeight; } ` previousHeight := 0 for i := 0; i < 250; i++ { height, err := page.Evaluate(script) if err != nil { logger.Printf("Error scrolling (iteration %d): %v\n", i+1, err) return fmt.Errorf("error scrolling: %v", err) } var currentHeight int switch v := height.(type) { case int: currentHeight = v case float64: currentHeight = int(v) default: logger.Printf("Unexpected height type: %T\n", height) return fmt.Errorf("unexpected height type: %T", height) } logger.Printf("Scroll iteration %d: height = %d\n", i+1, currentHeight) if currentHeight == previousHeight { logger.Println("Reached bottom of the page") break } previousHeight = currentHeight // Wait for content to load before scrolling again time.Sleep(100 * time.Millisecond) } logger.Println("Scrolling back to top") _, err := page.Evaluate(`() => { window.scrollTo(0, 0); }`) if err != nil { logger.Printf("Error scrolling back to top: %v\n", err) return fmt.Errorf("error scrolling back to top: %v", err) } logger.Println("Page scroll completed") return nil } // ExtractContentWithCSS extracts content from HTML using a CSS selector func ExtractContentWithCSS(content, includeSelector string, excludeSelectors []string) (string, error) { logger.Printf("Extracting content with CSS selector: %s\n", includeSelector) doc, err := goquery.NewDocumentFromReader(strings.NewReader(content)) if err != nil { return "", fmt.Errorf("error parsing HTML: %v", err) } selection := doc.Find(includeSelector) if selection.Length() == 0 { logger.Printf("Warning: No content found with CSS selector: %s. Falling back to body content.\n", includeSelector) selection = doc.Find("body") if selection.Length() == 0 { return "", fmt.Errorf("no content found in body") } } for _, excludeSelector := range excludeSelectors { selection.Find(excludeSelector).Remove() } selectedContent, err := selection.Html() if err != nil { return "", fmt.Errorf("error extracting content with CSS selector: %v", err) } // Trim leading and trailing whitespace selectedContent = strings.TrimSpace(selectedContent) // Normalize newlines selectedContent = strings.ReplaceAll(selectedContent, "\r\n", "\n") selectedContent = strings.ReplaceAll(selectedContent, "\r", "\n") // Remove indentation while preserving structure lines := strings.Split(selectedContent, "\n") for i, line := range lines { lines[i] = strings.TrimSpace(line) } selectedContent = strings.Join(lines, "\n") // Remove any leading or trailing newlines selectedContent = strings.Trim(selectedContent, "\n") logger.Printf("Extracted content length: %d\n", len(selectedContent)) return selectedContent, nil }