mirror of
https://github.com/tnypxl/rollup.git
synced 2025-12-15 15:03:17 +00:00
Compare commits
24 Commits
v0.0.1
...
refactor-2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
eaaa6449b4 | ||
|
|
318951063a | ||
|
|
02e39baf38 | ||
| 333b9a366c | |||
|
|
1869dae89a | ||
|
|
d3ff7cb862 | ||
|
|
ea410e4abb | ||
|
|
7d8e25b1ad | ||
|
|
691832e282 | ||
|
|
31e0fa5ea4 | ||
|
|
71f63ddaa8 | ||
|
|
574800c241 | ||
|
|
d5a94f5468 | ||
|
|
59994c085c | ||
|
|
396f092d50 | ||
|
|
274ef7ea79 | ||
|
|
a55e8df02a | ||
|
|
364b185269 | ||
|
|
952c2dda02 | ||
|
|
de84d68b4c | ||
|
|
e5d4c514a7 | ||
|
|
6ff44f81bb | ||
|
|
2fd411ce65 | ||
|
|
73116e8d82 |
68
README.md
68
README.md
@@ -4,16 +4,18 @@ Rollup aggregates the contents of text-based files and webpages into a markdown
|
|||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
- File type filtering
|
- File type filtering for targeted content aggregation
|
||||||
- Ignore patterns for excluding files
|
- Ignore patterns for excluding specific files or directories
|
||||||
- Support for code-generated file detection
|
- Support for code-generated file detection and exclusion
|
||||||
- Advanced web scraping functionality
|
- Advanced web scraping functionality with depth control
|
||||||
- Verbose logging option for detailed output
|
- Verbose logging option for detailed operation insights
|
||||||
- Exclusionary CSS selectors for web scraping
|
- Exclusionary CSS selectors for precise web content extraction
|
||||||
- Support for multiple URLs in web scraping
|
- Support for multiple URLs in web scraping operations
|
||||||
- Configurable output format for web scraping (single file or separate files)
|
- Configurable output format for web scraping (single file or separate files)
|
||||||
- Configuration file support (YAML)
|
- Flexible configuration file support (YAML)
|
||||||
- Generation of default configuration file
|
- Automatic generation of default configuration file
|
||||||
|
- Custom output file naming
|
||||||
|
- Rate limiting for web scraping to respect server resources
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
@@ -64,24 +66,36 @@ Rollup can be configured using a YAML file. By default, it looks for `rollup.yml
|
|||||||
Example `rollup.yml`:
|
Example `rollup.yml`:
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
file_types:
|
file_extensions:
|
||||||
- go
|
- go
|
||||||
- md
|
- md
|
||||||
ignore:
|
ignore_paths:
|
||||||
- node_modules/**
|
- node_modules/**
|
||||||
- vendor/**
|
- vendor/**
|
||||||
- .git/**
|
- .git/**
|
||||||
code_generated:
|
code_generated_paths:
|
||||||
- "**/generated/**"
|
- "**/generated/**"
|
||||||
scrape:
|
sites:
|
||||||
urls:
|
- base_url: https://example.com
|
||||||
- url: https://example.com
|
css_locator: .content
|
||||||
css_locator: .content
|
exclude_selectors:
|
||||||
exclude_selectors:
|
- .ads
|
||||||
- .ads
|
- .navigation
|
||||||
- .navigation
|
max_depth: 2
|
||||||
output_alias: example
|
allowed_paths:
|
||||||
output_type: single
|
- /blog
|
||||||
|
- /docs
|
||||||
|
exclude_paths:
|
||||||
|
- /admin
|
||||||
|
output_alias: example
|
||||||
|
path_overrides:
|
||||||
|
- path: /special-page
|
||||||
|
css_locator: .special-content
|
||||||
|
exclude_selectors:
|
||||||
|
- .special-ads
|
||||||
|
output_type: single
|
||||||
|
requests_per_second: 1.0
|
||||||
|
burst_limit: 3
|
||||||
```
|
```
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
@@ -111,10 +125,22 @@ scrape:
|
|||||||
```
|
```
|
||||||
|
|
||||||
5. Web scraping with separate output files:
|
5. Web scraping with separate output files:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
rollup web --urls=https://example.com,https://another-example.com --output=separate
|
rollup web --urls=https://example.com,https://another-example.com --output=separate
|
||||||
```
|
```
|
||||||
|
|
||||||
|
6. Rollup files with specific types and ignore patterns:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
rollup files --types=go,md --ignore=vendor/**,*_test.go
|
||||||
|
```
|
||||||
|
|
||||||
|
7. Web scraping with depth and CSS selector:
|
||||||
|
```bash
|
||||||
|
rollup web --urls=https://example.com --depth=2 --css=.main-content
|
||||||
|
```
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
Contributions are welcome! Please feel free to submit a Pull Request.
|
Contributions are welcome! Please feel free to submit a Pull Request.
|
||||||
|
|||||||
186
cmd/files.go
186
cmd/files.go
@@ -1,186 +0,0 @@
|
|||||||
package cmd
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/spf13/cobra"
|
|
||||||
)
|
|
||||||
|
|
||||||
var (
|
|
||||||
path string
|
|
||||||
fileTypes string
|
|
||||||
codeGenPatterns string
|
|
||||||
ignorePatterns string
|
|
||||||
)
|
|
||||||
|
|
||||||
var filesCmd = &cobra.Command{
|
|
||||||
Use: "files",
|
|
||||||
Short: "Rollup files into a single Markdown file",
|
|
||||||
Long: `The files subcommand writes the contents of all files (with target custom file types provided)
|
|
||||||
in a given project, current path or a custom path, to a single timestamped markdown file
|
|
||||||
whose name is <project-directory-name>-rollup-<timestamp>.md.`,
|
|
||||||
RunE: func(cmd *cobra.Command, args []string) error {
|
|
||||||
return runRollup()
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
func init() {
|
|
||||||
filesCmd.Flags().StringVarP(&path, "path", "p", ".", "Path to the project directory")
|
|
||||||
filesCmd.Flags().StringVarP(&fileTypes, "types", "t", ".go,.md,.txt", "Comma-separated list of file extensions to include")
|
|
||||||
filesCmd.Flags().StringVarP(&codeGenPatterns, "codegen", "g", "", "Comma-separated list of glob patterns for code-generated files")
|
|
||||||
filesCmd.Flags().StringVarP(&ignorePatterns, "ignore", "i", "", "Comma-separated list of glob patterns for files to ignore")
|
|
||||||
}
|
|
||||||
|
|
||||||
// matchGlob reports whether path matches a slash-separated glob pattern.
// The pattern is split on "/" and matched segment-by-segment; a "**"
// segment matches any (possibly empty) remainder of the path.
func matchGlob(pattern, path string) bool {
	parts := strings.Split(pattern, "/")
	return matchGlobRecursive(parts, path)
}

// matchGlobRecursive matches the remaining pattern segments against the
// remaining path. Plain segments are compared with filepath.Match; a "**"
// segment tries the rest of the pattern against every suffix of the path.
func matchGlobRecursive(patternParts []string, path string) bool {
	if len(patternParts) == 0 {
		// An exhausted pattern matches only an exhausted path.
		return path == ""
	}

	if patternParts[0] == "**" {
		// Try the remaining pattern at every offset, including the
		// empty suffix (i == len(path)).
		for i := 0; i <= len(path); i++ {
			if matchGlobRecursive(patternParts[1:], path[i:]) {
				return true
			}
		}
		return false
	}

	i := strings.IndexByte(path, '/')
	if i < 0 {
		// Last path segment: the pattern must also be on its last segment.
		matched, _ := filepath.Match(patternParts[0], path)
		return matched && len(patternParts) == 1
	}

	matched, _ := filepath.Match(patternParts[0], path[:i])
	return matched && matchGlobRecursive(patternParts[1:], path[i+1:])
}

// matchesAnyPattern reports whether filePath matches any of the given
// patterns. Patterns containing "**" are matched against the full relative
// path via matchGlob; all other patterns are matched against the base name
// only. This helper replaces the previously duplicated bodies of
// isCodeGenerated and isIgnored.
func matchesAnyPattern(filePath string, patterns []string) bool {
	for _, pattern := range patterns {
		if strings.Contains(pattern, "**") {
			if matchGlob(pattern, filePath) {
				return true
			}
		} else {
			matched, err := filepath.Match(pattern, filepath.Base(filePath))
			if err == nil && matched {
				return true
			}
		}
	}
	return false
}

// isCodeGenerated reports whether filePath matches any code-generated
// file pattern.
func isCodeGenerated(filePath string, patterns []string) bool {
	return matchesAnyPattern(filePath, patterns)
}

// isIgnored reports whether filePath matches any ignore pattern.
func isIgnored(filePath string, patterns []string) bool {
	return matchesAnyPattern(filePath, patterns)
}
|
|
||||||
|
|
||||||
func runRollup() error {
|
|
||||||
// Use config if available, otherwise use command-line flags
|
|
||||||
var types, codeGenList, ignoreList []string
|
|
||||||
if cfg != nil && len(cfg.FileTypes) > 0 {
|
|
||||||
types = cfg.FileTypes
|
|
||||||
} else {
|
|
||||||
types = strings.Split(fileTypes, ",")
|
|
||||||
}
|
|
||||||
if cfg != nil && len(cfg.CodeGenerated) > 0 {
|
|
||||||
codeGenList = cfg.CodeGenerated
|
|
||||||
} else {
|
|
||||||
codeGenList = strings.Split(codeGenPatterns, ",")
|
|
||||||
}
|
|
||||||
if cfg != nil && cfg.Ignore != nil && len(cfg.Ignore) > 0 {
|
|
||||||
ignoreList = cfg.Ignore
|
|
||||||
} else {
|
|
||||||
ignoreList = strings.Split(ignorePatterns, ",")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get the absolute path
|
|
||||||
absPath, err := filepath.Abs(path)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("error getting absolute path: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get the project directory name
|
|
||||||
projectName := filepath.Base(absPath)
|
|
||||||
|
|
||||||
// Generate the output file name
|
|
||||||
timestamp := time.Now().Format("20060102-150405")
|
|
||||||
outputFileName := fmt.Sprintf("%s-%s.rollup.md", projectName, timestamp)
|
|
||||||
|
|
||||||
// Open the output file
|
|
||||||
outputFile, err := os.Create(outputFileName)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("error creating output file: %v", err)
|
|
||||||
}
|
|
||||||
defer outputFile.Close()
|
|
||||||
|
|
||||||
// Walk through the directory
|
|
||||||
err = filepath.Walk(absPath, func(path string, info os.FileInfo, err error) error {
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if info.IsDir() {
|
|
||||||
if strings.HasPrefix(info.Name(), ".") {
|
|
||||||
return filepath.SkipDir
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
relPath, _ := filepath.Rel(absPath, path)
|
|
||||||
|
|
||||||
// Check if the file should be ignored
|
|
||||||
if isIgnored(relPath, ignoreList) {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
ext := filepath.Ext(path)
|
|
||||||
for _, t := range types {
|
|
||||||
if ext == "."+t {
|
|
||||||
// Read file contents
|
|
||||||
content, err := os.ReadFile(path)
|
|
||||||
if err != nil {
|
|
||||||
fmt.Printf("Error reading file %s: %v", path, err)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check if the file is code-generated
|
|
||||||
isCodeGen := isCodeGenerated(relPath, codeGenList)
|
|
||||||
codeGenNote := ""
|
|
||||||
if isCodeGen {
|
|
||||||
codeGenNote = " (Code-generated, Read-only)"
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write file name and contents to the output file
|
|
||||||
fmt.Fprintf(outputFile, "# File: %s%s\n\n```%s\n%s```\n\n", relPath, codeGenNote, t, string(content))
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("error walking through directory: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Printf("Rollup complete. Output file: %s", outputFileName)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
@@ -1,80 +0,0 @@
|
|||||||
package cmd
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"sort"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/spf13/cobra"
|
|
||||||
"github.com/tnypxl/rollup/internal/config"
|
|
||||||
"gopkg.in/yaml.v2"
|
|
||||||
)
|
|
||||||
|
|
||||||
var generateCmd = &cobra.Command{
|
|
||||||
Use: "generate",
|
|
||||||
Short: "Generate a rollup.yml config file",
|
|
||||||
Long: `Scan the current directory for text and code files and generate a rollup.yml config file based on the found file extensions.`,
|
|
||||||
RunE: runGenerate,
|
|
||||||
}
|
|
||||||
|
|
||||||
func runGenerate(cmd *cobra.Command, args []string) error {
|
|
||||||
fileTypes := make(map[string]bool)
|
|
||||||
err := filepath.Walk(".", func(path string, info os.FileInfo, err error) error {
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if !info.IsDir() {
|
|
||||||
ext := strings.TrimPrefix(filepath.Ext(path), ".")
|
|
||||||
if isTextFile(ext) {
|
|
||||||
fileTypes[ext] = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("error walking the path: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
cfg := config.Config{
|
|
||||||
FileTypes: make([]string, 0, len(fileTypes)),
|
|
||||||
Ignore: []string{"node_modules/**", "vendor/**", ".git/**"},
|
|
||||||
}
|
|
||||||
|
|
||||||
for ext := range fileTypes {
|
|
||||||
cfg.FileTypes = append(cfg.FileTypes, ext)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sort file types for consistency
|
|
||||||
sort.Strings(cfg.FileTypes)
|
|
||||||
|
|
||||||
yamlData, err := yaml.Marshal(&cfg)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("error marshaling config: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
outputPath := config.DefaultConfigPath()
|
|
||||||
err = os.WriteFile(outputPath, yamlData, 0644)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("error writing config file: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Printf("Generated %s file successfully.\n", outputPath)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// isTextFile reports whether the given dot-less extension belongs to a
// known text or source-code file type.
func isTextFile(ext string) bool {
	switch ext {
	case "txt", "md", "go", "py", "js", "html", "css",
		"json", "xml", "yaml", "yml", "toml", "ini",
		"sh", "bash", "zsh", "fish",
		"c", "cpp", "h", "hpp", "java", "kt", "scala",
		"rs", "rb", "php", "ts", "swift":
		return true
	}
	return false
}

func init() {
	// Add any flags for the generate command here if needed.
}
|
|
||||||
36
cmd/root.go
36
cmd/root.go
@@ -1,36 +0,0 @@
|
|||||||
package cmd
|
|
||||||
|
|
||||||
import (
|
|
||||||
"github.com/spf13/cobra"
|
|
||||||
config "github.com/tnypxl/rollup/internal/config"
|
|
||||||
)
|
|
||||||
|
|
||||||
var (
|
|
||||||
configFile string
|
|
||||||
cfg *config.Config
|
|
||||||
verbose bool
|
|
||||||
)
|
|
||||||
|
|
||||||
var rootCmd = &cobra.Command{
|
|
||||||
Use: "rollup",
|
|
||||||
Short: "Rollup is a tool for combining and processing files",
|
|
||||||
Long: `Rollup is a versatile tool that can combine and process files in various ways.
|
|
||||||
Use subcommands to perform specific operations.`,
|
|
||||||
}
|
|
||||||
|
|
||||||
func Execute(conf *config.Config) error {
|
|
||||||
cfg = conf
|
|
||||||
if cfg == nil {
|
|
||||||
cfg = &config.Config{} // Use an empty config if none is provided
|
|
||||||
}
|
|
||||||
return rootCmd.Execute()
|
|
||||||
}
|
|
||||||
|
|
||||||
func init() {
|
|
||||||
rootCmd.PersistentFlags().StringVarP(&configFile, "config", "f", "", "Path to the config file (default: rollup.yml in the current directory)")
|
|
||||||
rootCmd.PersistentFlags().BoolVarP(&verbose, "verbose", "v", false, "Enable verbose logging")
|
|
||||||
|
|
||||||
rootCmd.AddCommand(filesCmd)
|
|
||||||
rootCmd.AddCommand(webCmd)
|
|
||||||
rootCmd.AddCommand(generateCmd)
|
|
||||||
}
|
|
||||||
217
cmd/web.go
217
cmd/web.go
@@ -1,217 +0,0 @@
|
|||||||
package cmd
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"net/url"
|
|
||||||
"os"
|
|
||||||
"regexp"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/spf13/cobra"
|
|
||||||
"github.com/tnypxl/rollup/internal/scraper"
|
|
||||||
)
|
|
||||||
|
|
||||||
var (
|
|
||||||
urls []string
|
|
||||||
outputType string
|
|
||||||
depth int
|
|
||||||
includeSelector string
|
|
||||||
excludeSelectors []string
|
|
||||||
)
|
|
||||||
|
|
||||||
var scraperConfig scraper.Config
|
|
||||||
|
|
||||||
var webCmd = &cobra.Command{
|
|
||||||
Use: "web",
|
|
||||||
Short: "Scrape main content from webpages and convert to Markdown",
|
|
||||||
Long: `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`,
|
|
||||||
RunE: runWeb,
|
|
||||||
}
|
|
||||||
|
|
||||||
func init() {
|
|
||||||
webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
|
|
||||||
webCmd.Flags().StringVarP(&outputType, "output", "o", "single", "Output type: 'single' for one file, 'separate' for multiple files")
|
|
||||||
webCmd.Flags().IntVarP(&depth, "depth", "d", 0, "Depth of link traversal (default: 0, only scrape the given URLs)")
|
|
||||||
webCmd.Flags().StringVar(&includeSelector, "css", "", "CSS selector to extract specific content")
|
|
||||||
webCmd.Flags().StringSliceVar(&excludeSelectors, "exclude", []string{}, "CSS selectors to exclude from the extracted content (comma-separated)")
|
|
||||||
}
|
|
||||||
|
|
||||||
func runWeb(cmd *cobra.Command, args []string) error {
|
|
||||||
scraperConfig.Verbose = verbose
|
|
||||||
|
|
||||||
// Use config if available, otherwise use command-line flags
|
|
||||||
var urlConfigs []scraper.URLConfig
|
|
||||||
if len(urls) == 0 && len(cfg.Scrape.URLs) > 0 {
|
|
||||||
urlConfigs = make([]scraper.URLConfig, len(cfg.Scrape.URLs))
|
|
||||||
for i, u := range cfg.Scrape.URLs {
|
|
||||||
urlConfigs[i] = scraper.URLConfig{
|
|
||||||
URL: u.URL,
|
|
||||||
CSSLocator: u.CSSLocator,
|
|
||||||
ExcludeSelectors: u.ExcludeSelectors,
|
|
||||||
OutputAlias: u.OutputAlias,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
urlConfigs = make([]scraper.URLConfig, len(urls))
|
|
||||||
for i, u := range urls {
|
|
||||||
urlConfigs[i] = scraper.URLConfig{URL: u, CSSLocator: includeSelector}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(urlConfigs) == 0 {
|
|
||||||
return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.urls' in the rollup.yml file")
|
|
||||||
}
|
|
||||||
|
|
||||||
scraperConfig := scraper.Config{
|
|
||||||
URLs: urlConfigs,
|
|
||||||
OutputType: outputType,
|
|
||||||
Verbose: verbose,
|
|
||||||
}
|
|
||||||
|
|
||||||
scrapedContent, err := scraper.ScrapeMultipleURLs(scraperConfig)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("error scraping content: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if outputType == "single" {
|
|
||||||
return writeSingleFile(scrapedContent)
|
|
||||||
} else {
|
|
||||||
return writeMultipleFiles(scrapedContent)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func writeSingleFile(content map[string]string) error {
|
|
||||||
outputFile := generateDefaultFilename()
|
|
||||||
file, err := os.Create(outputFile)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("error creating output file: %v", err)
|
|
||||||
}
|
|
||||||
defer file.Close()
|
|
||||||
|
|
||||||
for url, c := range content {
|
|
||||||
_, err = fmt.Fprintf(file, "# Content from %s\n\n%s\n\n---\n\n", url, c)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("error writing content to file: %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(content), outputFile)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func writeMultipleFiles(content map[string]string) error {
|
|
||||||
for url, c := range content {
|
|
||||||
filename := getFilenameFromContent(c, url)
|
|
||||||
file, err := os.Create(filename)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("error creating output file %s: %v", filename, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
_, err = fmt.Fprintf(file, "# Content from %s\n\n%s", url, c)
|
|
||||||
file.Close()
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("error writing content to file %s: %v", filename, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Printf("Content from %s has been saved to %s\n", url, filename)
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// generateDefaultFilename returns a timestamped default output name of the
// form "web-YYYYMMDD-HHMMSS.rollup.md".
func generateDefaultFilename() string {
	return fmt.Sprintf("web-%s.rollup.md", time.Now().Format("20060102-150405"))
}
|
|
||||||
|
|
||||||
func scrapeRecursively(urlStr string, depth int) (string, error) {
|
|
||||||
visited := make(map[string]bool)
|
|
||||||
return scrapeURL(urlStr, depth, visited)
|
|
||||||
}
|
|
||||||
|
|
||||||
func scrapeURL(urlStr string, depth int, visited map[string]bool) (string, error) {
|
|
||||||
if depth < 0 || visited[urlStr] {
|
|
||||||
return "", nil
|
|
||||||
}
|
|
||||||
|
|
||||||
visited[urlStr] = true
|
|
||||||
|
|
||||||
content, err := extractAndConvertContent(urlStr)
|
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
if depth > 0 {
|
|
||||||
links, err := scraper.ExtractLinks(urlStr)
|
|
||||||
if err != nil {
|
|
||||||
return content, fmt.Errorf("error extracting links: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, link := range links {
|
|
||||||
subContent, err := scrapeURL(link, depth-1, visited)
|
|
||||||
if err != nil {
|
|
||||||
fmt.Printf("Warning: Error scraping %s: %v\n", link, err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
content += "\n\n---\n\n" + subContent
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return content, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func extractAndConvertContent(urlStr string) (string, error) {
|
|
||||||
content, err := scraper.FetchWebpageContent(urlStr)
|
|
||||||
if err != nil {
|
|
||||||
return "", fmt.Errorf("error fetching webpage content: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if includeSelector != "" {
|
|
||||||
content, err = scraper.ExtractContentWithCSS(content, includeSelector, excludeSelectors)
|
|
||||||
if err != nil {
|
|
||||||
return "", fmt.Errorf("error extracting content with CSS: %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
markdown, err := scraper.ProcessHTMLContent(content, scraper.Config{})
|
|
||||||
if err != nil {
|
|
||||||
return "", fmt.Errorf("error processing HTML content: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
parsedURL, err := url.Parse(urlStr)
|
|
||||||
if err != nil {
|
|
||||||
return "", fmt.Errorf("error parsing URL: %v", err)
|
|
||||||
}
|
|
||||||
header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String())
|
|
||||||
|
|
||||||
return header + markdown + "\n\n", nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// filenameSanitizer matches every run of characters that is not
// alphanumeric, a dash, or an underscore. Compiled once at package scope
// rather than on every sanitizeFilename call.
var filenameSanitizer = regexp.MustCompile("[^a-zA-Z0-9-_]+")

// getFilenameFromContent derives an output filename for scraped content:
// the sanitized <title> text when one is present, otherwise the sanitized
// URL, always with a ".md" extension.
func getFilenameFromContent(content, url string) string {
	titleStart := strings.Index(content, "<title>")
	titleEnd := strings.Index(content, "</title>")
	if titleStart != -1 && titleEnd != -1 && titleEnd > titleStart {
		title := content[titleStart+7 : titleEnd] // 7 == len("<title>")
		return sanitizeFilename(title) + ".md"
	}

	// No title found, so fall back to the URL.
	return sanitizeFilename(url) + ".md"
}

// sanitizeFilename reduces name to the character set [a-zA-Z0-9-_],
// collapsing each disallowed run to a single underscore, trims leading and
// trailing underscores, and falls back to "untitled" when nothing remains.
func sanitizeFilename(name string) string {
	name = filenameSanitizer.ReplaceAllString(name, "_")
	name = strings.Trim(name, "_")
	if name == "" {
		name = "untitled"
	}
	return name
}
|
|
||||||
1
go.mod
1
go.mod
@@ -5,6 +5,7 @@ go 1.23
|
|||||||
require (
|
require (
|
||||||
github.com/JohannesKaufmann/html-to-markdown v1.6.0
|
github.com/JohannesKaufmann/html-to-markdown v1.6.0
|
||||||
github.com/spf13/cobra v1.8.1
|
github.com/spf13/cobra v1.8.1
|
||||||
|
golang.org/x/time v0.6.0
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
|
|||||||
2
go.sum
2
go.sum
@@ -102,6 +102,8 @@ golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
|
|||||||
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
||||||
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||||
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||||
|
golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U=
|
||||||
|
golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
|
||||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||||
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||||
|
|||||||
@@ -1,52 +0,0 @@
|
|||||||
package config
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"os"
|
|
||||||
|
|
||||||
"gopkg.in/yaml.v2"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Config struct {
|
|
||||||
FileTypes []string `yaml:"file_types"`
|
|
||||||
Ignore []string `yaml:"ignore"`
|
|
||||||
CodeGenerated []string `yaml:"code_generated"`
|
|
||||||
Scrape ScrapeConfig `yaml:"scrape"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type ScrapeConfig struct {
|
|
||||||
URLs []URLConfig `yaml:"urls"`
|
|
||||||
OutputType string `yaml:"output_type"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type URLConfig struct {
|
|
||||||
URL string `yaml:"url"`
|
|
||||||
CSSLocator string `yaml:"css_locator"`
|
|
||||||
ExcludeSelectors []string `yaml:"exclude_selectors"`
|
|
||||||
OutputAlias string `yaml:"output_alias"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func Load(configPath string) (*Config, error) {
|
|
||||||
data, err := os.ReadFile(configPath)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("error reading config file: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
var config Config
|
|
||||||
err = yaml.Unmarshal(data, &config)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("error parsing config file: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return &config, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// DefaultConfigPath returns the config filename that rollup looks for in
// the current working directory.
func DefaultConfigPath() string {
	return "rollup.yml"
}

// FileExists reports whether filename can be stat'ed successfully.
func FileExists(filename string) bool {
	if _, err := os.Stat(filename); err != nil {
		return false
	}
	return true
}
|
|
||||||
|
|
||||||
@@ -1,369 +0,0 @@
|
|||||||
package scraper
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"io/ioutil"
|
|
||||||
"log"
|
|
||||||
"math/rand"
|
|
||||||
"regexp"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
|
||||||
"github.com/playwright-community/playwright-go"
|
|
||||||
md "github.com/JohannesKaufmann/html-to-markdown"
|
|
||||||
)
|
|
||||||
|
|
||||||
var logger *log.Logger
|
|
||||||
|
|
||||||
var (
|
|
||||||
pw *playwright.Playwright
|
|
||||||
browser playwright.Browser
|
|
||||||
)
|
|
||||||
|
|
||||||
// Config holds the scraper configuration
|
|
||||||
type Config struct {
|
|
||||||
URLs []URLConfig
|
|
||||||
OutputType string
|
|
||||||
Verbose bool
|
|
||||||
}
|
|
||||||
|
|
||||||
// ScrapeMultipleURLs scrapes multiple URLs concurrently
|
|
||||||
func ScrapeMultipleURLs(config Config) (map[string]string, error) {
|
|
||||||
results := make(chan struct {
|
|
||||||
url string
|
|
||||||
content string
|
|
||||||
err error
|
|
||||||
}, len(config.URLs))
|
|
||||||
|
|
||||||
for _, urlConfig := range config.URLs {
|
|
||||||
go func(cfg URLConfig) {
|
|
||||||
content, err := scrapeURL(cfg)
|
|
||||||
results <- struct {
|
|
||||||
url string
|
|
||||||
content string
|
|
||||||
err error
|
|
||||||
}{cfg.URL, content, err}
|
|
||||||
}(urlConfig)
|
|
||||||
}
|
|
||||||
|
|
||||||
scrapedContent := make(map[string]string)
|
|
||||||
for i := 0; i < len(config.URLs); i++ {
|
|
||||||
result := <-results
|
|
||||||
if result.err != nil {
|
|
||||||
logger.Printf("Error scraping %s: %v\n", result.url, result.err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
scrapedContent[result.url] = result.content
|
|
||||||
}
|
|
||||||
|
|
||||||
return scrapedContent, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func scrapeURL(config URLConfig) (string, error) {
|
|
||||||
content, err := FetchWebpageContent(config.URL)
|
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
if config.CSSLocator != "" {
|
|
||||||
content, err = ExtractContentWithCSS(content, config.CSSLocator, config.ExcludeSelectors)
|
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return ProcessHTMLContent(content, Config{})
|
|
||||||
}
|
|
||||||
|
|
||||||
// getFilenameFromContent derives an output filename: the sanitized <title>
// text when present in the content, otherwise the sanitized URL, with a
// ".md" extension either way.
func getFilenameFromContent(content, url string) string {
	openTag := strings.Index(content, "<title>")
	closeTag := strings.Index(content, "</title>")
	if openTag >= 0 && closeTag >= 0 && closeTag > openTag {
		title := content[openTag+len("<title>") : closeTag]
		return sanitizeFilename(title) + ".md"
	}

	// No title found; fall back to the URL.
	return sanitizeFilename(url) + ".md"
}

// sanitizeFilename collapses every run of characters outside [a-zA-Z0-9-_]
// into a single underscore.
func sanitizeFilename(name string) string {
	pattern, _ := regexp.Compile("[^a-zA-Z0-9-_]+")
	return pattern.ReplaceAllString(name, "_")
}
|
|
||||||
|
|
||||||
// URLConfig holds configuration for a single URL
|
|
||||||
// URLConfig holds configuration for a single URL to be scraped.
type URLConfig struct {
	URL              string   // page to fetch
	CSSLocator       string   // optional selector narrowing the content
	ExcludeSelectors []string // selectors removed from the extraction
	OutputAlias      string   // optional override for the output name
}
|
|
||||||
|
|
||||||
// SetupLogger initializes the logger based on the verbose flag
|
|
||||||
func SetupLogger(verbose bool) {
|
|
||||||
if verbose {
|
|
||||||
logger = log.New(log.Writer(), "SCRAPER: ", log.LstdFlags)
|
|
||||||
} else {
|
|
||||||
logger = log.New(ioutil.Discard, "", 0)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// InitPlaywright initializes Playwright and launches the browser
|
|
||||||
func InitPlaywright() error {
|
|
||||||
logger.Println("Initializing Playwright")
|
|
||||||
var err error
|
|
||||||
|
|
||||||
// Install Playwright and Chromium browser
|
|
||||||
err = playwright.Install(&playwright.RunOptions{Browsers: []string{"chromium"}})
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("could not install Playwright and Chromium: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
pw, err = playwright.Run()
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("could not start Playwright: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
userAgent := "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
|
||||||
|
|
||||||
browser, err = pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
|
|
||||||
Args: []string{fmt.Sprintf("--user-agent=%s", userAgent)},
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("could not launch browser: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Println("Playwright initialized successfully")
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// ClosePlaywright closes the browser and stops Playwright
|
|
||||||
func ClosePlaywright() {
|
|
||||||
if browser != nil {
|
|
||||||
browser.Close()
|
|
||||||
}
|
|
||||||
if pw != nil {
|
|
||||||
pw.Stop()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// FetchWebpageContent retrieves the content of a webpage using Playwright
|
|
||||||
func FetchWebpageContent(urlStr string) (string, error) {
|
|
||||||
logger.Printf("Fetching webpage content for URL: %s\n", urlStr)
|
|
||||||
|
|
||||||
page, err := browser.NewPage()
|
|
||||||
if err != nil {
|
|
||||||
logger.Printf("Error creating new page: %v\n", err)
|
|
||||||
return "", fmt.Errorf("could not create page: %v", err)
|
|
||||||
}
|
|
||||||
defer page.Close()
|
|
||||||
|
|
||||||
time.Sleep(time.Duration(rand.Intn(2000)+1000) * time.Millisecond)
|
|
||||||
|
|
||||||
logger.Printf("Navigating to URL: %s\n", urlStr)
|
|
||||||
if _, err = page.Goto(urlStr, playwright.PageGotoOptions{
|
|
||||||
WaitUntil: playwright.WaitUntilStateNetworkidle,
|
|
||||||
}); err != nil {
|
|
||||||
logger.Printf("Error navigating to page: %v\n", err)
|
|
||||||
return "", fmt.Errorf("could not go to page: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Println("Waiting for page load state")
|
|
||||||
err = page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
|
|
||||||
State: playwright.LoadStateNetworkidle,
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
logger.Printf("Error waiting for page load: %v\n", err)
|
|
||||||
return "", fmt.Errorf("error waiting for page load: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Println("Scrolling page")
|
|
||||||
err = scrollPage(page)
|
|
||||||
if err != nil {
|
|
||||||
logger.Printf("Error scrolling page: %v\n", err)
|
|
||||||
return "", fmt.Errorf("error scrolling page: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Println("Waiting for body element")
|
|
||||||
_, err = page.WaitForSelector("body", playwright.PageWaitForSelectorOptions{
|
|
||||||
State: playwright.WaitForSelectorStateVisible,
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
logger.Printf("Error waiting for body: %v\n", err)
|
|
||||||
return "", fmt.Errorf("error waiting for body: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Println("Getting page content")
|
|
||||||
content, err := page.Content()
|
|
||||||
if err != nil {
|
|
||||||
logger.Printf("Error getting page content: %v\n", err)
|
|
||||||
return "", fmt.Errorf("could not get page content: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if content == "" {
|
|
||||||
logger.Println(" content is empty, falling back to body content")
|
|
||||||
content, err = page.InnerHTML("body")
|
|
||||||
if err != nil {
|
|
||||||
logger.Printf("Error getting body content: %v\n", err)
|
|
||||||
return "", fmt.Errorf("could not get body content: %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Printf("Successfully fetched webpage content (length: %d)\n", len(content))
|
|
||||||
return content, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// ProcessHTMLContent converts HTML content to Markdown
|
|
||||||
func ProcessHTMLContent(htmlContent string, config Config) (string, error) {
|
|
||||||
logger.Printf("Processing HTML content (length: %d)\n", len(htmlContent))
|
|
||||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
|
|
||||||
if err != nil {
|
|
||||||
logger.Printf("Error parsing HTML: %v\n", err)
|
|
||||||
return "", fmt.Errorf("error parsing HTML: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
selection := doc.Find("body")
|
|
||||||
logger.Println("Processing entire body")
|
|
||||||
|
|
||||||
if selection.Length() == 0 {
|
|
||||||
return "", fmt.Errorf("no content found in the document")
|
|
||||||
}
|
|
||||||
|
|
||||||
content, err := selection.Html()
|
|
||||||
if err != nil {
|
|
||||||
logger.Printf("Error extracting content: %v\n", err)
|
|
||||||
return "", fmt.Errorf("error extracting content: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create a new converter
|
|
||||||
converter := md.NewConverter("", true, nil)
|
|
||||||
|
|
||||||
// Convert HTML to Markdown
|
|
||||||
markdown, err := converter.ConvertString(content)
|
|
||||||
if err != nil {
|
|
||||||
logger.Printf("Error converting HTML to Markdown: %v\n", err)
|
|
||||||
return "", fmt.Errorf("error converting HTML to Markdown: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Printf("Converted HTML to Markdown (length: %d)\n", len(markdown))
|
|
||||||
return markdown, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func scrollPage(page playwright.Page) error {
|
|
||||||
logger.Println("Starting page scroll")
|
|
||||||
script := `
|
|
||||||
() => {
|
|
||||||
window.scrollTo(0, document.body.scrollHeight);
|
|
||||||
return document.body.scrollHeight;
|
|
||||||
}
|
|
||||||
`
|
|
||||||
|
|
||||||
previousHeight := 0
|
|
||||||
for i := 0; i < 250; i++ {
|
|
||||||
height, err := page.Evaluate(script)
|
|
||||||
if err != nil {
|
|
||||||
logger.Printf("Error scrolling (iteration %d): %v\n", i+1, err)
|
|
||||||
return fmt.Errorf("error scrolling: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
var currentHeight int
|
|
||||||
switch v := height.(type) {
|
|
||||||
case int:
|
|
||||||
currentHeight = v
|
|
||||||
case float64:
|
|
||||||
currentHeight = int(v)
|
|
||||||
default:
|
|
||||||
logger.Printf("Unexpected height type: %T\n", height)
|
|
||||||
return fmt.Errorf("unexpected height type: %T", height)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Printf("Scroll iteration %d: height = %d\n", i+1, currentHeight)
|
|
||||||
|
|
||||||
if currentHeight == previousHeight {
|
|
||||||
logger.Println("Reached bottom of the page")
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
previousHeight = currentHeight
|
|
||||||
|
|
||||||
page.WaitForTimeout(500)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Println("Scrolling back to top")
|
|
||||||
_, err := page.Evaluate(`() => { window.scrollTo(0, 0); }`)
|
|
||||||
if err != nil {
|
|
||||||
logger.Printf("Error scrolling back to top: %v\n", err)
|
|
||||||
return fmt.Errorf("error scrolling back to top: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Println("Page scroll completed")
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// ExtractLinks extracts all links from the given URL
|
|
||||||
func ExtractLinks(urlStr string) ([]string, error) {
|
|
||||||
logger.Printf("Extracting links from URL: %s\n", urlStr)
|
|
||||||
|
|
||||||
page, err := browser.NewPage()
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("could not create page: %v", err)
|
|
||||||
}
|
|
||||||
defer page.Close()
|
|
||||||
|
|
||||||
if _, err = page.Goto(urlStr, playwright.PageGotoOptions{
|
|
||||||
WaitUntil: playwright.WaitUntilStateNetworkidle,
|
|
||||||
}); err != nil {
|
|
||||||
return nil, fmt.Errorf("could not go to page: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
links, err := page.Evaluate(`() => {
|
|
||||||
const anchors = document.querySelectorAll('a');
|
|
||||||
return Array.from(anchors).map(a => a.href);
|
|
||||||
}`)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("could not extract links: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
var result []string
|
|
||||||
for _, link := range links.([]interface{}) {
|
|
||||||
result = append(result, link.(string))
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Printf("Extracted %d links\n", len(result))
|
|
||||||
return result, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// ExtractContentWithCSS extracts content from HTML using a CSS selector
|
|
||||||
func ExtractContentWithCSS(content, includeSelector string, excludeSelectors []string) (string, error) {
|
|
||||||
logger.Printf("Extracting content with CSS selector: %s\n", includeSelector)
|
|
||||||
|
|
||||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
|
|
||||||
if err != nil {
|
|
||||||
return "", fmt.Errorf("error parsing HTML: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
selection := doc.Find(includeSelector)
|
|
||||||
if selection.Length() == 0 {
|
|
||||||
logger.Printf("Warning: No content found with CSS selector: %s. Falling back to body content.\n", includeSelector)
|
|
||||||
selection = doc.Find("body")
|
|
||||||
if selection.Length() == 0 {
|
|
||||||
return "", fmt.Errorf("no content found in body")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, excludeSelector := range excludeSelectors {
|
|
||||||
selection.Find(excludeSelector).Remove()
|
|
||||||
}
|
|
||||||
|
|
||||||
selectedContent, err := selection.Html()
|
|
||||||
if err != nil {
|
|
||||||
return "", fmt.Errorf("error extracting content with CSS selector: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Printf("Extracted content length: %d\n", len(selectedContent))
|
|
||||||
return selectedContent, nil
|
|
||||||
}
|
|
||||||
2
main.go
2
main.go
@@ -20,7 +20,7 @@ func main() {
|
|||||||
var err error
|
var err error
|
||||||
|
|
||||||
if !isHelpCommand {
|
if !isHelpCommand {
|
||||||
configPath := config.DefaultConfigPath()
|
configPath := "rollup.yml"
|
||||||
cfg, err = config.Load(configPath)
|
cfg, err = config.Load(configPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Warning: Failed to load configuration: %v", err)
|
log.Printf("Warning: Failed to load configuration: %v", err)
|
||||||
|
|||||||
Reference in New Issue
Block a user