package scraper

import (
	"io"
	"log"
	// "net/http"
	// "net/http/httptest"
	"reflect"
	"strings"
	"testing"
)

// TestIsAllowedURL checks isAllowedURL against a site rooted at example.com
// that lists /blog and /products as allowed paths and /admin and /private as
// excluded paths.
func TestIsAllowedURL(t *testing.T) {
	site := SiteConfig{
		BaseURL:      "https://example.com",
		AllowedPaths: []string{"/blog", "/products"},
		ExcludePaths: []string{"/admin", "/private"},
	}

	tests := []struct {
		url      string
		expected bool
	}{
		{"https://example.com/blog/post1", true},
		{"https://example.com/products/item1", true},
		{"https://example.com/admin/dashboard", false},
		{"https://example.com/private/data", false},
		{"https://example.com/other/page", false},
		{"https://othersite.com/blog/post1", false},
	}

	for _, test := range tests {
		// One subtest per URL so a failure names the offending case.
		t.Run(test.url, func(t *testing.T) {
			if result := isAllowedURL(test.url, site); result != test.expected {
				t.Errorf("isAllowedURL(%q) = %v, want %v", test.url, result, test.expected)
			}
		})
	}
}

// TestGetOverrides checks that getOverrides returns the site-wide locator and
// exclude selectors for an ordinary URL, and the values of the /special path
// override for URLs at or below /special.
func TestGetOverrides(t *testing.T) {
	site := SiteConfig{
		CSSLocator:       "main",
		ExcludeSelectors: []string{".ads"},
		PathOverrides: []PathOverride{
			{
				Path:             "/special",
				CSSLocator:       ".special-content",
				ExcludeSelectors: []string{".sidebar"},
			},
		},
	}

	tests := []struct {
		url              string
		expectedLocator  string
		expectedExcludes []string
	}{
		{"https://example.com/normal", "main", []string{".ads"}},
		{"https://example.com/special", ".special-content", []string{".sidebar"}},
		{"https://example.com/special/page", ".special-content", []string{".sidebar"}},
	}

	for _, test := range tests {
		t.Run(test.url, func(t *testing.T) {
			locator, excludes := getOverrides(test.url, site)
			if locator != test.expectedLocator {
				t.Errorf("getOverrides(%q) locator = %q, want %q", test.url, locator, test.expectedLocator)
			}
			if !reflect.DeepEqual(excludes, test.expectedExcludes) {
				t.Errorf("getOverrides(%q) excludes = %v, want %v", test.url, excludes, test.expectedExcludes)
			}
		})
	}
}

// TestExtractContentWithCSS checks ExtractContentWithCSS with an include
// selector alone and with an include selector combined with exclude
// selectors. Results are compared after trimming surrounding whitespace.
func TestExtractContentWithCSS(t *testing.T) {
	// Initialize logger for testing; output is discarded.
	logger = log.New(io.Discard, "", 0)

	// NOTE(review): the fixture markup was restored from the expectations
	// below (an <aside> holding "Sidebar content", an element with class
	// "ads") - confirm against the intended fixture. The markup is kept
	// flush-left so indentation cannot leak into the extracted fragment.
	html := `
<html>
<body>
<main>
<h1>Main Content</h1>
<p>This is the main content.</p>
<div class="ads">Advertisement</div>
</main>
<aside>Sidebar content</aside>
</body>
</html>
`

	tests := []struct {
		name             string
		includeSelector  string
		excludeSelectors []string
		expected         string
	}{
		{
			name:            "main without excludes",
			includeSelector: "main",
			expected:        "<h1>Main Content</h1>\n<p>This is the main content.</p>\n<div class=\"ads\">Advertisement</div>",
		},
		{
			name:             "main excluding ads",
			includeSelector:  "main",
			excludeSelectors: []string{".ads"},
			expected:         "<h1>Main Content</h1>\n<p>This is the main content.</p>",
		},
		{
			name:            "aside",
			includeSelector: "aside",
			expected:        "Sidebar content",
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			result, err := ExtractContentWithCSS(html, test.includeSelector, test.excludeSelectors)
			if err != nil {
				t.Fatalf("ExtractContentWithCSS() returned error: %v", err)
			}
			if strings.TrimSpace(result) != strings.TrimSpace(test.expected) {
				t.Errorf("ExtractContentWithCSS() = %q, want %q", result, test.expected)
			}
		})
	}
}

// TestProcessHTMLContent checks that ProcessHTMLContent, called with a zero
// Config, turns a heading, a paragraph with bold text and a two-item list
// into the expected Markdown. The comparison ignores surrounding whitespace.
func TestProcessHTMLContent(t *testing.T) {
	// NOTE(review): the fixture markup and the blank lines inside the
	// expected Markdown were restored from the visible text - confirm.
	html := `
<html>
<body>
<h1>Test Heading</h1>
<p>This is a <strong>test</strong> paragraph.</p>
<ul>
<li>Item 1</li>
<li>Item 2</li>
</ul>
</body>
</html>
`

	expected := strings.TrimSpace(`
# Test Heading

This is a **test** paragraph.

- Item 1
- Item 2
`)

	result, err := ProcessHTMLContent(html, Config{})
	if err != nil {
		t.Fatalf("ProcessHTMLContent() returned error: %v", err)
	}
	if strings.TrimSpace(result) != expected {
		t.Errorf("ProcessHTMLContent() = %q, want %q", result, expected)
	}
}

// TestExtractLinks is disabled. As written it initializes Playwright, which
// is why the net/http and net/http/httptest imports above are commented out
// along with it.
//
// func TestExtractLinks(t *testing.T) {
// 	// Initialize Playwright before running the test
// 	if err := InitPlaywright(); err != nil {
// 		t.Fatalf("Failed to initialize Playwright: %v", err)
// 	}
// 	defer ClosePlaywright()
//
// 	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// 		w.Header().Set("Content-Type", "text/html")
// 		w.Write([]byte(`
// 			<html>
// 				<body>
// 					<a href="https://example.com/page1">Page 1</a>
// 					<a href="https://example.com/page2">Page 2</a>
// 					<a href="https://othersite.com">Other Site</a>
// 				</body>
// 			</html>
// 		`))
// 	}))
// 	defer server.Close()
//
// 	links, err := ExtractLinks(server.URL)
// 	if err != nil {
// 		t.Fatalf("ExtractLinks() returned error: %v", err)
// 	}
//
// 	expectedLinks := []string{
// 		"https://example.com/page1",
// 		"https://example.com/page2",
// 		"https://othersite.com",
// 	}
//
// 	if !reflect.DeepEqual(links, expectedLinks) {
// 		t.Errorf("ExtractLinks() = %v, want %v", links, expectedLinks)
// 	}
// }