1 Commit

5 changed files with 280 additions and 270 deletions

View File

@@ -2,7 +2,7 @@ package cmd
import (
"fmt"
"io/ioutil"
"io"
"log"
"net/url"
"os"
@@ -44,7 +44,7 @@ func runWeb(cmd *cobra.Command, args []string) error {
scraper.SetupLogger(verbose)
logger := log.New(os.Stdout, "WEB: ", log.LstdFlags)
if !verbose {
-logger.SetOutput(ioutil.Discard)
+logger.SetOutput(io.Discard)
}
logger.Printf("Starting web scraping process with verbose mode: %v", verbose)
scraperConfig.Verbose = verbose
@@ -139,7 +139,7 @@ func writeSingleFile(content map[string]string) error {
defer file.Close()
for url, c := range content {
-_, err = fmt.Fprintf(file, "# Content from %s\n\n%s\n\n---\n\n", url, c)
+_, err = fmt.Fprintf(file, "# ::: Content from %s\n\n%s\n\n---\n\n", url, c)
if err != nil {
return fmt.Errorf("error writing content to file: %v", err)
}
@@ -161,7 +161,7 @@ func writeMultipleFiles(content map[string]string) error {
return fmt.Errorf("error creating output file %s: %v", filename, err)
}
-_, err = file.WriteString(fmt.Sprintf("# Content from %s\n\n%s\n", url, c))
+_, err = file.WriteString(fmt.Sprintf("# ::: Content from %s\n\n%s\n", url, c))
if err != nil {
file.Close()
return fmt.Errorf("error writing content to file %s: %v", filename, err)
@@ -215,8 +215,10 @@ func scrapeURL(urlStr string, depth int, visited map[string]bool) (string, error
return content, nil
}
-var testExtractAndConvertContent = extractAndConvertContent
-var testExtractLinks = scraper.ExtractLinks
+var (
+testExtractAndConvertContent = extractAndConvertContent
+testExtractLinks = scraper.ExtractLinks
+)
func extractAndConvertContent(urlStr string) (string, error) {
content, err := scraper.FetchWebpageContent(urlStr)
@@ -240,7 +242,7 @@ func extractAndConvertContent(urlStr string) (string, error) {
if err != nil {
return "", fmt.Errorf("error parsing URL: %v", err)
}
-header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String())
+header := fmt.Sprintf("# ::: Content from %s\n\n", parsedURL.String())
return header + markdown + "\n\n", nil
}
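The ioutil.Discard to io.Discard swap above uses the io.Discard writer available since Go 1.16. A minimal sketch of the verbose-flag logger pattern the command relies on (the function name and prefix are illustrative, not the project's exact code):

```go
package main

import (
	"io"
	"log"
	"os"
)

// newLogger writes to stdout when verbose is true and silently drops
// everything otherwise. io.Discard replaces the deprecated ioutil.Discard.
func newLogger(verbose bool) *log.Logger {
	logger := log.New(os.Stdout, "WEB: ", log.LstdFlags)
	if !verbose {
		logger.SetOutput(io.Discard)
	}
	return logger
}

func main() {
	newLogger(false).Println("this line is discarded")
	newLogger(true).Println("this line reaches stdout")
}
```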

View File

@@ -103,7 +103,7 @@ func mockExtractAndConvertContent(urlStr string) (string, error) {
return "Mocked content for " + urlStr, nil
}
-func mockExtractLinks(urlStr string) ([]string, error) {
+func mockExtractLinks() ([]string, error) {
return []string{"http://example.com/link1", "http://example.com/link2"}, nil
}

View File

@@ -45,10 +45,10 @@ scrape:
}
defer os.Remove(tmpfile.Name())
-if _, err := tmpfile.Write(content); err != nil {
+if _, err = tmpfile.Write(content); err != nil {
t.Fatalf("Failed to write to temp file: %v", err)
}
-if err := tmpfile.Close(); err != nil {
+if err = tmpfile.Close(); err != nil {
t.Fatalf("Failed to close temp file: %v", err)
}
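The := to = changes above assign to the test's existing err variable instead of shadowing it inside the if statements. A minimal sketch of that temp-file fixture pattern (the package name, YAML content, and file name pattern are assumptions for illustration only):

```go
package config_test

import (
	"os"
	"testing"
)

func TestTempConfigFixture(t *testing.T) {
	content := []byte("scrape:\n  - baseurl: https://example.com\n")

	tmpfile, err := os.CreateTemp("", "config-*.yaml")
	if err != nil {
		t.Fatalf("Failed to create temp file: %v", err)
	}
	defer os.Remove(tmpfile.Name())

	// Reuse the enclosing err rather than shadowing it with :=.
	if _, err = tmpfile.Write(content); err != nil {
		t.Fatalf("Failed to write to temp file: %v", err)
	}
	if err = tmpfile.Close(); err != nil {
		t.Fatalf("Failed to close temp file: %v", err)
	}
}
```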

View File

@@ -1,21 +1,21 @@
package scraper
import (
"context"
"fmt"
"io/ioutil"
"io"
"log"
"math/rand"
"net/url"
"os"
"regexp"
"strings"
"time"
"sync"
"context"
"time"
md "github.com/JohannesKaufmann/html-to-markdown"
"github.com/PuerkitoBio/goquery"
"github.com/playwright-community/playwright-go"
md "github.com/JohannesKaufmann/html-to-markdown"
"golang.org/x/time/rate"
)
@@ -81,7 +81,7 @@ func ScrapeSites(config Config) (map[string]string, error) {
fullURL := site.BaseURL + path
totalURLs++
logger.Printf("Queueing URL for scraping: %s\n", fullURL)
-scrapeSingleURL(fullURL, site, config, results, limiter)
+scrapeSingleURL(fullURL, site, results, limiter)
}
}(site)
}
@@ -108,11 +108,12 @@ func ScrapeSites(config Config) (map[string]string, error) {
return scrapedContent, nil
}
-func scrapeSingleURL(url string, site SiteConfig, config Config, results chan<- struct {
+func scrapeSingleURL(url string, site SiteConfig, results chan<- struct {
url string
content string
err error
-}, limiter *rate.Limiter) {
+}, limiter *rate.Limiter,
+) {
logger.Printf("Starting to scrape URL: %s\n", url)
// Wait for rate limiter before making the request
@@ -155,11 +156,12 @@ func scrapeSingleURL(url string, site SiteConfig, config Config, results chan<-
}{url, content, nil}
}
-func scrapeSite(site SiteConfig, config Config, results chan<- struct {
+func scrapeSite(site SiteConfig, results chan<- struct {
url string
content string
err error
-}, limiter *rate.Limiter) {
+}, limiter *rate.Limiter,
+) {
visited := make(map[string]bool)
queue := []string{site.BaseURL}
@@ -296,7 +298,7 @@ func SetupLogger(verbose bool) {
if verbose {
logger = log.New(os.Stdout, "SCRAPER: ", log.LstdFlags)
} else {
-logger = log.New(ioutil.Discard, "", 0)
+logger = log.New(io.Discard, "", 0)
}
}
@@ -387,7 +389,9 @@ func FetchWebpageContent(urlStr string) (string, error) {
}
logger.Println("Waiting for body element")
-_, err = page.WaitForSelector("body", playwright.PageWaitForSelectorOptions{
+bodyElement := page.Locator("body")
+err = bodyElement.WaitFor(playwright.LocatorWaitForOptions{
State: playwright.WaitForSelectorStateVisible,
})
if err != nil {
@@ -404,7 +408,7 @@ func FetchWebpageContent(urlStr string) (string, error) {
if content == "" {
logger.Println(" content is empty, falling back to body content")
-content, err = page.InnerHTML("body")
+content, err = bodyElement.InnerHTML()
if err != nil {
logger.Printf("Error getting body content: %v\n", err)
return "", fmt.Errorf("could not get body content: %v", err)
@@ -457,6 +461,8 @@ func scrollPage(page playwright.Page) error {
() => {
window.scrollTo(0, document.body.scrollHeight);
return document.body.scrollHeight;
+// wait for 500 ms
+new Promise(resolve => setTimeout(resolve, 500));
}
`
@@ -488,7 +494,9 @@ func scrollPage(page playwright.Page) error {
previousHeight = currentHeight
+page.WaitForTimeout(500)
+// Wait for a while before scrolling again
}
logger.Println("Scrolling back to top")

View File

@@ -1,13 +1,13 @@
package scraper
import (
"testing"
"io"
"log"
"net/http"
"net/http/httptest"
"strings"
"reflect"
"log"
"io/ioutil"
"strings"
"testing"
)
func TestIsAllowedURL(t *testing.T) {
@@ -73,7 +73,7 @@ func TestGetOverrides(t *testing.T) {
func TestExtractContentWithCSS(t *testing.T) {
// Initialize logger for testing
-logger = log.New(ioutil.Discard, "", 0)
+logger = log.New(io.Discard, "", 0)
html := `
<html>