Mirror of https://github.com/tnypxl/rollup.git (synced 2025-12-13 06:23:18 +00:00)

Compare commits: fix-loggin...refactor-2 (27 commits)

- eaaa6449b4
- 318951063a
- 02e39baf38
- 333b9a366c
- 1869dae89a
- d3ff7cb862
- ea410e4abb
- 7d8e25b1ad
- 691832e282
- 31e0fa5ea4
- 71f63ddaa8
- 574800c241
- d5a94f5468
- 59994c085c
- 396f092d50
- 274ef7ea79
- a55e8df02a
- 364b185269
- 952c2dda02
- de84d68b4c
- e5d4c514a7
- 6ff44f81bb
- 2fd411ce65
- 73116e8d82
- 5482621d99
- 3788a08b00
- 8ba54001ce
README.md (68 changed lines)

````diff
@@ -4,16 +4,18 @@ Rollup aggregates the contents of text-based files and webpages into a markdown
 
 ## Features
 
-- File type filtering
-- Ignore patterns for excluding files
-- Support for code-generated file detection
-- Advanced web scraping functionality
-- Verbose logging option for detailed output
-- Exclusionary CSS selectors for web scraping
-- Support for multiple URLs in web scraping
+- File type filtering for targeted content aggregation
+- Ignore patterns for excluding specific files or directories
+- Support for code-generated file detection and exclusion
+- Advanced web scraping functionality with depth control
+- Verbose logging option for detailed operation insights
+- Exclusionary CSS selectors for precise web content extraction
+- Support for multiple URLs in web scraping operations
+- Configurable output format for web scraping (single file or separate files)
-- Configuration file support (YAML)
-- Generation of default configuration file
+- Flexible configuration file support (YAML)
+- Automatic generation of default configuration file
 - Custom output file naming
+- Rate limiting for web scraping to respect server resources
 
 ## Installation
 
@@ -64,24 +66,36 @@ Rollup can be configured using a YAML file. By default, it looks for `rollup.yml
 
 Example `rollup.yml`:
 
 ```yaml
-file_types:
+file_extensions:
   - go
   - md
-ignore:
+ignore_paths:
   - node_modules/**
   - vendor/**
   - .git/**
-code_generated:
+code_generated_paths:
   - **/generated/**
 scrape:
-  urls:
-    - url: https://example.com
-      css_locator: .content
-      exclude_selectors:
-        - .ads
-        - .navigation
-      output_alias: example
-  output_type: single
+  sites:
+    - base_url: https://example.com
+      css_locator: .content
+      exclude_selectors:
+        - .ads
+        - .navigation
+      max_depth: 2
+      allowed_paths:
+        - /blog
+        - /docs
+      exclude_paths:
+        - /admin
+      output_alias: example
+      path_overrides:
+        - path: /special-page
+          css_locator: .special-content
+          exclude_selectors:
+            - .special-ads
+  output_type: single
+  requests_per_second: 1.0
+  burst_limit: 3
 ```
 
 ## Examples
 
@@ -111,10 +125,22 @@ scrape:
 ```
 
+5. Web scraping with separate output files:
+
+```bash
+rollup web --urls=https://example.com,https://another-example.com --output=separate
+```
+
+6. Rollup files with specific types and ignore patterns:
+
+```bash
+rollup files --types=go,md --ignore=vendor/**,*_test.go
+```
+
 7. Web scraping with depth and CSS selector:
 ```bash
 rollup web --urls=https://example.com --depth=2 --css=.main-content
 ```
 
 ## Contributing
 
 Contributions are welcome! Please feel free to submit a Pull Request.
````
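The diff renames every top-level key (`file_types` to `file_extensions`, `ignore` to `ignore_paths`, `code_generated` to `code_generated_paths`) and replaces `scrape.urls` with a richer `scrape.sites` block, but the matching Go-side structs are not part of this compare. Below is a sketch of what the new schema plausibly deserializes into; every field and type name is inferred from the YAML keys above rather than taken from the refactor-2 source.

```go
// Hypothetical structs for the new rollup.yml schema. All names here are
// inferred from the YAML example in the README diff, not confirmed code.
package config

type Config struct {
	FileExtensions     []string     `yaml:"file_extensions"`
	IgnorePaths        []string     `yaml:"ignore_paths"`
	CodeGeneratedPaths []string     `yaml:"code_generated_paths"`
	Scrape             ScrapeConfig `yaml:"scrape"`
}

type ScrapeConfig struct {
	Sites             []SiteConfig `yaml:"sites"`
	OutputType        string       `yaml:"output_type"`         // "single" or "separate"
	RequestsPerSecond float64      `yaml:"requests_per_second"` // rate limit
	BurstLimit        int          `yaml:"burst_limit"`
}

type SiteConfig struct {
	BaseURL          string         `yaml:"base_url"`
	CSSLocator       string         `yaml:"css_locator"`
	ExcludeSelectors []string       `yaml:"exclude_selectors"`
	MaxDepth         int            `yaml:"max_depth"`
	AllowedPaths     []string       `yaml:"allowed_paths"`
	ExcludePaths     []string       `yaml:"exclude_paths"`
	OutputAlias      string         `yaml:"output_alias"`
	PathOverrides    []PathOverride `yaml:"path_overrides"`
}

type PathOverride struct {
	Path             string   `yaml:"path"`
	CSSLocator       string   `yaml:"css_locator"`
	ExcludeSelectors []string `yaml:"exclude_selectors"`
}
```

The `golang.org/x/time` requirement added in go.mod further down fits the new `requests_per_second`/`burst_limit` settings, presumably via `x/time/rate`.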
cmd/files.go (186 lines deleted: @@ -1,186 +0,0 @@)

```go
package cmd

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/spf13/cobra"
)

var (
	path            string
	fileTypes       string
	codeGenPatterns string
	ignorePatterns  string
)

var filesCmd = &cobra.Command{
	Use:   "files",
	Short: "Rollup files into a single Markdown file",
	Long: `The files subcommand writes the contents of all files (with target custom file types provided)
in a given project, current path or a custom path, to a single timestamped markdown file
whose name is <project-directory-name>-rollup-<timestamp>.md.`,
	RunE: func(cmd *cobra.Command, args []string) error {
		return runRollup()
	},
}

func init() {
	filesCmd.Flags().StringVarP(&path, "path", "p", ".", "Path to the project directory")
	filesCmd.Flags().StringVarP(&fileTypes, "types", "t", ".go,.md,.txt", "Comma-separated list of file extensions to include")
	filesCmd.Flags().StringVarP(&codeGenPatterns, "codegen", "g", "", "Comma-separated list of glob patterns for code-generated files")
	filesCmd.Flags().StringVarP(&ignorePatterns, "ignore", "i", "", "Comma-separated list of glob patterns for files to ignore")
}

func matchGlob(pattern, path string) bool {
	parts := strings.Split(pattern, "/")
	return matchGlobRecursive(parts, path)
}

func matchGlobRecursive(patternParts []string, path string) bool {
	if len(patternParts) == 0 {
		return path == ""
	}

	// "**" may consume any prefix of the remaining path, including none.
	if patternParts[0] == "**" {
		for i := 0; i <= len(path); i++ {
			if matchGlobRecursive(patternParts[1:], path[i:]) {
				return true
			}
		}
		return false
	}

	i := strings.IndexByte(path, '/')
	if i < 0 {
		matched, _ := filepath.Match(patternParts[0], path)
		return matched && len(patternParts) == 1
	}

	matched, _ := filepath.Match(patternParts[0], path[:i])
	return matched && matchGlobRecursive(patternParts[1:], path[i+1:])
}

func isCodeGenerated(filePath string, patterns []string) bool {
	for _, pattern := range patterns {
		if strings.Contains(pattern, "**") {
			if matchGlob(pattern, filePath) {
				return true
			}
		} else {
			matched, err := filepath.Match(pattern, filepath.Base(filePath))
			if err == nil && matched {
				return true
			}
		}
	}
	return false
}

func isIgnored(filePath string, patterns []string) bool {
	for _, pattern := range patterns {
		if strings.Contains(pattern, "**") {
			if matchGlob(pattern, filePath) {
				return true
			}
		} else {
			matched, err := filepath.Match(pattern, filepath.Base(filePath))
			if err == nil && matched {
				return true
			}
		}
	}
	return false
}

func runRollup() error {
	// Use config if available, otherwise use command-line flags
	var types, codeGenList, ignoreList []string
	if cfg != nil && len(cfg.FileTypes) > 0 {
		types = cfg.FileTypes
	} else {
		types = strings.Split(fileTypes, ",")
	}
	if cfg != nil && len(cfg.CodeGenerated) > 0 {
		codeGenList = cfg.CodeGenerated
	} else {
		codeGenList = strings.Split(codeGenPatterns, ",")
	}
	if cfg != nil && len(cfg.Ignore) > 0 {
		ignoreList = cfg.Ignore
	} else {
		ignoreList = strings.Split(ignorePatterns, ",")
	}

	// Get the absolute path
	absPath, err := filepath.Abs(path)
	if err != nil {
		return fmt.Errorf("error getting absolute path: %v", err)
	}

	// Get the project directory name
	projectName := filepath.Base(absPath)

	// Generate the output file name
	timestamp := time.Now().Format("20060102-150405")
	outputFileName := fmt.Sprintf("%s-%s.rollup.md", projectName, timestamp)

	// Open the output file
	outputFile, err := os.Create(outputFileName)
	if err != nil {
		return fmt.Errorf("error creating output file: %v", err)
	}
	defer outputFile.Close()

	// Walk through the directory
	err = filepath.Walk(absPath, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			if strings.HasPrefix(info.Name(), ".") {
				return filepath.SkipDir
			}
			return nil
		}
		relPath, _ := filepath.Rel(absPath, path)

		// Check if the file should be ignored
		if isIgnored(relPath, ignoreList) {
			return nil
		}

		ext := filepath.Ext(path)
		for _, t := range types {
			if ext == "."+t {
				// Read file contents
				content, err := os.ReadFile(path)
				if err != nil {
					fmt.Printf("Error reading file %s: %v\n", path, err)
					return nil
				}

				// Check if the file is code-generated
				isCodeGen := isCodeGenerated(relPath, codeGenList)
				codeGenNote := ""
				if isCodeGen {
					codeGenNote = " (Code-generated, Read-only)"
				}

				// Write file name and contents to the output file
				fmt.Fprintf(outputFile, "# File: %s%s\n\n```%s\n%s```\n\n", relPath, codeGenNote, t, string(content))
				break
			}
		}
		return nil
	})
	if err != nil {
		return fmt.Errorf("error walking through directory: %v", err)
	}

	fmt.Printf("Rollup complete. Output file: %s\n", outputFileName)
	return nil
}
```
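The `**` case in `matchGlobRecursive` tries every byte offset of the remaining path, so a double star can absorb zero or more characters, nested segments included, while single pattern segments go through `filepath.Match`. A table-driven test (hypothetical; no test file appears in this compare) pins down the behavior:

```go
package cmd

import "testing"

// TestMatchGlobSketch is a hypothetical test exercising matchGlob from
// the deleted cmd/files.go above; it is not part of the repository.
func TestMatchGlobSketch(t *testing.T) {
	cases := []struct {
		pattern, path string
		want          bool
	}{
		{"node_modules/**", "node_modules/react/index.js", true}, // ** spans nested segments
		{"**/generated/**", "internal/generated/api.go", true},   // leading and trailing **
		{"*_test.go", "files_test.go", true},                     // plain filepath.Match semantics
		{"vendor/**", "internal/vendor.go", false},               // first segment must match
	}
	for _, c := range cases {
		if got := matchGlob(c.pattern, c.path); got != c.want {
			t.Errorf("matchGlob(%q, %q) = %v, want %v", c.pattern, c.path, got, c.want)
		}
	}
}
```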
@@ -1,80 +0,0 @@ (deleted file)

```go
package cmd

import (
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strings"

	"github.com/spf13/cobra"
	"github.com/tnypxl/rollup/internal/config"
	"gopkg.in/yaml.v2"
)

var generateCmd = &cobra.Command{
	Use:   "generate",
	Short: "Generate a rollup.yml config file",
	Long:  `Scan the current directory for text and code files and generate a rollup.yml config file based on the found file extensions.`,
	RunE:  runGenerate,
}

func runGenerate(cmd *cobra.Command, args []string) error {
	fileTypes := make(map[string]bool)
	err := filepath.Walk(".", func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if !info.IsDir() {
			ext := strings.TrimPrefix(filepath.Ext(path), ".")
			if isTextFile(ext) {
				fileTypes[ext] = true
			}
		}
		return nil
	})
	if err != nil {
		return fmt.Errorf("error walking the path: %v", err)
	}

	cfg := config.Config{
		FileTypes: make([]string, 0, len(fileTypes)),
		Ignore:    []string{"node_modules/**", "vendor/**", ".git/**"},
	}

	for ext := range fileTypes {
		cfg.FileTypes = append(cfg.FileTypes, ext)
	}

	// Sort file types for consistency
	sort.Strings(cfg.FileTypes)

	yamlData, err := yaml.Marshal(&cfg)
	if err != nil {
		return fmt.Errorf("error marshaling config: %v", err)
	}

	outputPath := config.DefaultConfigPath()
	err = os.WriteFile(outputPath, yamlData, 0644)
	if err != nil {
		return fmt.Errorf("error writing config file: %v", err)
	}

	fmt.Printf("Generated %s file successfully.\n", outputPath)
	return nil
}

func isTextFile(ext string) bool {
	textExtensions := map[string]bool{
		"txt": true, "md": true, "go": true, "py": true, "js": true, "html": true, "css": true,
		"json": true, "xml": true, "yaml": true, "yml": true, "toml": true, "ini": true,
		"sh": true, "bash": true, "zsh": true, "fish": true,
		"c": true, "cpp": true, "h": true, "hpp": true, "java": true, "kt": true, "scala": true,
		"rs": true, "rb": true, "php": true, "ts": true, "swift": true,
	}
	return textExtensions[ext]
}

func init() {
	// Add any flags for the generate command here if needed
}
```
cmd/root.go (36 lines deleted: @@ -1,36 +0,0 @@)

```go
package cmd

import (
	"github.com/spf13/cobra"
	config "github.com/tnypxl/rollup/internal/config"
)

var (
	configFile string
	cfg        *config.Config
	verbose    bool
)

var rootCmd = &cobra.Command{
	Use:   "rollup",
	Short: "Rollup is a tool for combining and processing files",
	Long: `Rollup is a versatile tool that can combine and process files in various ways.
Use subcommands to perform specific operations.`,
}

func Execute(conf *config.Config) error {
	cfg = conf
	if cfg == nil {
		cfg = &config.Config{} // Use an empty config if none is provided
	}
	return rootCmd.Execute()
}

func init() {
	rootCmd.PersistentFlags().StringVarP(&configFile, "config", "f", "", "Path to the config file (default: rollup.yml in the current directory)")
	rootCmd.PersistentFlags().BoolVarP(&verbose, "verbose", "v", false, "Enable verbose logging")

	rootCmd.AddCommand(filesCmd)
	rootCmd.AddCommand(webCmd)
	rootCmd.AddCommand(generateCmd)
}
```
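`Execute` accepts a possibly-nil `*config.Config` and substitutes an empty one, which suggests how the entry point was wired. A plausible `main` under that assumption (the real `main.go` is not included in this compare), using only functions that appear elsewhere in this diff:

```go
package main

import (
	"fmt"
	"os"

	"github.com/tnypxl/rollup/cmd"
	"github.com/tnypxl/rollup/internal/config"
	"github.com/tnypxl/rollup/internal/scraper"
)

func main() {
	// Load rollup.yml if present; cmd.Execute falls back to an empty config.
	var cfg *config.Config
	if config.FileExists(config.DefaultConfigPath()) {
		loaded, err := config.Load(config.DefaultConfigPath())
		if err != nil {
			fmt.Fprintf(os.Stderr, "error loading config: %v\n", err)
			os.Exit(1)
		}
		cfg = loaded
	}

	// The scraper package needs a logger and a running Playwright browser.
	scraper.SetupLogger(false)
	if err := scraper.InitPlaywright(); err != nil {
		fmt.Fprintf(os.Stderr, "error initializing Playwright: %v\n", err)
		os.Exit(1)
	}
	defer scraper.ClosePlaywright()

	if err := cmd.Execute(cfg); err != nil {
		os.Exit(1)
	}
}
```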
cmd/web.go (217 lines deleted: @@ -1,217 +0,0 @@)

```go
package cmd

import (
	"fmt"
	"net/url"
	"os"
	"regexp"
	"strings"
	"time"

	"github.com/spf13/cobra"
	"github.com/tnypxl/rollup/internal/scraper"
)

var (
	urls             []string
	outputType       string
	depth            int
	includeSelector  string
	excludeSelectors []string
)

var scraperConfig scraper.Config

var webCmd = &cobra.Command{
	Use:   "web",
	Short: "Scrape main content from webpages and convert to Markdown",
	Long:  `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`,
	RunE:  runWeb,
}

func init() {
	webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
	webCmd.Flags().StringVarP(&outputType, "output", "o", "single", "Output type: 'single' for one file, 'separate' for multiple files")
	webCmd.Flags().IntVarP(&depth, "depth", "d", 0, "Depth of link traversal (default: 0, only scrape the given URLs)")
	webCmd.Flags().StringVar(&includeSelector, "css", "", "CSS selector to extract specific content")
	webCmd.Flags().StringSliceVar(&excludeSelectors, "exclude", []string{}, "CSS selectors to exclude from the extracted content (comma-separated)")
}

func runWeb(cmd *cobra.Command, args []string) error {
	scraperConfig.Verbose = verbose

	// Use config if available, otherwise use command-line flags
	var urlConfigs []scraper.URLConfig
	if len(urls) == 0 && len(cfg.Scrape.URLs) > 0 {
		urlConfigs = make([]scraper.URLConfig, len(cfg.Scrape.URLs))
		for i, u := range cfg.Scrape.URLs {
			urlConfigs[i] = scraper.URLConfig{
				URL:              u.URL,
				CSSLocator:       u.CSSLocator,
				ExcludeSelectors: u.ExcludeSelectors,
				OutputAlias:      u.OutputAlias,
			}
		}
	} else {
		urlConfigs = make([]scraper.URLConfig, len(urls))
		for i, u := range urls {
			urlConfigs[i] = scraper.URLConfig{URL: u, CSSLocator: includeSelector}
		}
	}

	if len(urlConfigs) == 0 {
		return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.urls' in the rollup.yml file")
	}

	// Note: this local declaration shadows the package-level scraperConfig above.
	scraperConfig := scraper.Config{
		URLs:       urlConfigs,
		OutputType: outputType,
		Verbose:    verbose,
	}

	scrapedContent, err := scraper.ScrapeMultipleURLs(scraperConfig)
	if err != nil {
		return fmt.Errorf("error scraping content: %v", err)
	}

	if outputType == "single" {
		return writeSingleFile(scrapedContent)
	}
	return writeMultipleFiles(scrapedContent)
}

func writeSingleFile(content map[string]string) error {
	outputFile := generateDefaultFilename(urls)
	file, err := os.Create(outputFile)
	if err != nil {
		return fmt.Errorf("error creating output file: %v", err)
	}
	defer file.Close()

	for url, c := range content {
		_, err = file.WriteString(fmt.Sprintf("# Content from %s\n\n%s\n\n---\n\n", url, c))
		if err != nil {
			return fmt.Errorf("error writing content to file: %v", err)
		}
	}

	fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(content), outputFile)
	return nil
}

func writeMultipleFiles(content map[string]string) error {
	for url, c := range content {
		filename := getFilenameFromContent(c, url)
		file, err := os.Create(filename)
		if err != nil {
			return fmt.Errorf("error creating output file %s: %v", filename, err)
		}

		_, err = file.WriteString(fmt.Sprintf("# Content from %s\n\n%s", url, c))
		file.Close()
		if err != nil {
			return fmt.Errorf("error writing content to file %s: %v", filename, err)
		}

		fmt.Printf("Content from %s has been saved to %s\n", url, filename)
	}
	return nil
}

// generateDefaultFilename ignores its argument: the name is derived from a timestamp alone.
func generateDefaultFilename(urls []string) string {
	timestamp := time.Now().Format("20060102-150405")
	return fmt.Sprintf("web-%s.rollup.md", timestamp)
}

// scrapeRecursively drives depth-limited traversal; it is not referenced elsewhere in this file.
func scrapeRecursively(urlStr string, depth int) (string, error) {
	visited := make(map[string]bool)
	return scrapeURL(urlStr, depth, visited)
}

func scrapeURL(urlStr string, depth int, visited map[string]bool) (string, error) {
	if depth < 0 || visited[urlStr] {
		return "", nil
	}

	visited[urlStr] = true

	content, err := extractAndConvertContent(urlStr)
	if err != nil {
		return "", err
	}

	if depth > 0 {
		links, err := scraper.ExtractLinks(urlStr)
		if err != nil {
			return content, fmt.Errorf("error extracting links: %v", err)
		}

		for _, link := range links {
			subContent, err := scrapeURL(link, depth-1, visited)
			if err != nil {
				fmt.Printf("Warning: Error scraping %s: %v\n", link, err)
				continue
			}
			content += "\n\n---\n\n" + subContent
		}
	}

	return content, nil
}

func extractAndConvertContent(urlStr string) (string, error) {
	content, err := scraper.FetchWebpageContent(urlStr)
	if err != nil {
		return "", fmt.Errorf("error fetching webpage content: %v", err)
	}

	if includeSelector != "" {
		content, err = scraper.ExtractContentWithCSS(content, includeSelector, excludeSelectors)
		if err != nil {
			return "", fmt.Errorf("error extracting content with CSS: %v", err)
		}
	}

	markdown, err := scraper.ProcessHTMLContent(content, scraper.Config{})
	if err != nil {
		return "", fmt.Errorf("error processing HTML content: %v", err)
	}

	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return "", fmt.Errorf("error parsing URL: %v", err)
	}
	header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String())

	return header + markdown + "\n\n", nil
}

func getFilenameFromContent(content, url string) string {
	// Try to extract title from content
	titleStart := strings.Index(content, "<title>")
	titleEnd := strings.Index(content, "</title>")
	if titleStart != -1 && titleEnd != -1 && titleEnd > titleStart {
		title := content[titleStart+7 : titleEnd]
		return sanitizeFilename(title) + ".md"
	}

	// If no title found, use the URL
	return sanitizeFilename(url) + ".md"
}

func sanitizeFilename(name string) string {
	// Remove any character that isn't alphanumeric, dash, or underscore
	reg := regexp.MustCompile("[^a-zA-Z0-9-_]+")
	name = reg.ReplaceAllString(name, "_")

	// Trim any leading or trailing underscores
	name = strings.Trim(name, "_")

	// If the name is empty after sanitization, use a default name
	if name == "" {
		name = "untitled"
	}

	return name
}
```
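`getFilenameFromContent` scans the raw HTML for a `<title>` pair and otherwise falls back to the sanitized URL, so the fallback is the common case once a CSS locator has already stripped the `<head>`. A hypothetical illustration of the resulting names:

```go
package cmd

import "fmt"

// exampleFilenames is a hypothetical driver for the filename helpers
// in the deleted cmd/web.go; it is not part of the repository.
func exampleFilenames() {
	withTitle := "<html><head><title>My Blog Post!</title></head><body>...</body></html>"
	fmt.Println(getFilenameFromContent(withTitle, "https://example.com/post"))
	// -> My_Blog_Post.md (non-alphanumeric runs collapse to "_", edges trimmed)

	fmt.Println(getFilenameFromContent("<p>no head here</p>", "https://example.com/post"))
	// -> https_example_com_post.md (URL fallback)
}
```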
go.mod (2 changed lines)

```diff
@@ -5,6 +5,7 @@ go 1.23
 require (
 	github.com/JohannesKaufmann/html-to-markdown v1.6.0
 	github.com/spf13/cobra v1.8.1
+	golang.org/x/time v0.6.0
 )
 
 require (
@@ -21,7 +22,6 @@ require (
 	github.com/PuerkitoBio/goquery v1.9.2
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
 	github.com/playwright-community/playwright-go v0.4501.1
-	github.com/russross/blackfriday/v2 v2.1.0
 	github.com/spf13/pflag v1.0.5 // indirect
 	gopkg.in/yaml.v2 v2.4.0
 )
```
go.sum (3 changed lines)

```diff
@@ -32,7 +32,6 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M=
 github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA=
-github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
 github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
 github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
@@ -103,6 +102,8 @@ golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
 golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
 golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
 golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U=
+golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
```
@@ -1,52 +0,0 @@ (deleted file)

```go
package config

import (
	"fmt"
	"os"

	"gopkg.in/yaml.v2"
)

type Config struct {
	FileTypes     []string     `yaml:"file_types"`
	Ignore        []string     `yaml:"ignore"`
	CodeGenerated []string     `yaml:"code_generated"`
	Scrape        ScrapeConfig `yaml:"scrape"`
}

type ScrapeConfig struct {
	URLs       []URLConfig `yaml:"urls"`
	OutputType string      `yaml:"output_type"`
}

type URLConfig struct {
	URL              string   `yaml:"url"`
	CSSLocator       string   `yaml:"css_locator"`
	ExcludeSelectors []string `yaml:"exclude_selectors"`
	OutputAlias      string   `yaml:"output_alias"`
}

func Load(configPath string) (*Config, error) {
	data, err := os.ReadFile(configPath)
	if err != nil {
		return nil, fmt.Errorf("error reading config file: %v", err)
	}

	var config Config
	err = yaml.Unmarshal(data, &config)
	if err != nil {
		return nil, fmt.Errorf("error parsing config file: %v", err)
	}

	return &config, nil
}

func DefaultConfigPath() string {
	return "rollup.yml"
}

func FileExists(filename string) bool {
	_, err := os.Stat(filename)
	return err == nil
}
```
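`Load` is a plain `yaml.Unmarshal` into `Config`, and yaml.v2 silently drops keys it does not recognize, so a file written for the new `file_extensions`/`ignore_paths` schema would simply load as empty slices here. A minimal caller sketch against the old API shown above:

```go
package main

import (
	"fmt"
	"log"

	"github.com/tnypxl/rollup/internal/config"
)

func main() {
	// Only attempt a load when rollup.yml actually exists.
	if !config.FileExists(config.DefaultConfigPath()) {
		log.Fatal("no rollup.yml in the current directory")
	}
	cfg, err := config.Load(config.DefaultConfigPath())
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("file types:", cfg.FileTypes)
	fmt.Println("scrape output:", cfg.Scrape.OutputType)
}
```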
@@ -1,369 +0,0 @@ (deleted file)

```go
package scraper

import (
	"fmt"
	"io/ioutil"
	"log"
	"math/rand"
	"regexp"
	"strings"
	"time"

	md "github.com/JohannesKaufmann/html-to-markdown"
	"github.com/PuerkitoBio/goquery"
	"github.com/playwright-community/playwright-go"
)

var logger *log.Logger

var (
	pw      *playwright.Playwright
	browser playwright.Browser
)

// Config holds the scraper configuration
type Config struct {
	URLs       []URLConfig
	OutputType string
	Verbose    bool
}

// ScrapeMultipleURLs scrapes multiple URLs concurrently
func ScrapeMultipleURLs(config Config) (map[string]string, error) {
	results := make(chan struct {
		url     string
		content string
		err     error
	}, len(config.URLs))

	for _, urlConfig := range config.URLs {
		go func(cfg URLConfig) {
			content, err := scrapeURL(cfg)
			results <- struct {
				url     string
				content string
				err     error
			}{cfg.URL, content, err}
		}(urlConfig)
	}

	scrapedContent := make(map[string]string)
	for i := 0; i < len(config.URLs); i++ {
		result := <-results
		if result.err != nil {
			logger.Printf("Error scraping %s: %v\n", result.url, result.err)
			continue
		}
		scrapedContent[result.url] = result.content
	}

	return scrapedContent, nil
}

func scrapeURL(config URLConfig) (string, error) {
	content, err := FetchWebpageContent(config.URL)
	if err != nil {
		return "", err
	}

	if config.CSSLocator != "" {
		content, err = ExtractContentWithCSS(content, config.CSSLocator, config.ExcludeSelectors)
		if err != nil {
			return "", err
		}
	}

	return ProcessHTMLContent(content, Config{})
}

func getFilenameFromContent(content, url string) string {
	// Try to extract title from content
	titleStart := strings.Index(content, "<title>")
	titleEnd := strings.Index(content, "</title>")
	if titleStart != -1 && titleEnd != -1 && titleEnd > titleStart {
		title := content[titleStart+7 : titleEnd]
		return sanitizeFilename(title) + ".md"
	}

	// If no title found, use the URL
	return sanitizeFilename(url) + ".md"
}

func sanitizeFilename(name string) string {
	// Remove any character that isn't alphanumeric, dash, or underscore
	reg, _ := regexp.Compile("[^a-zA-Z0-9-_]+")
	return reg.ReplaceAllString(name, "_")
}

// URLConfig holds configuration for a single URL
type URLConfig struct {
	URL              string
	CSSLocator       string
	ExcludeSelectors []string
	OutputAlias      string
}

// SetupLogger initializes the logger based on the verbose flag
func SetupLogger(verbose bool) {
	if verbose {
		logger = log.New(log.Writer(), "SCRAPER: ", log.LstdFlags)
	} else {
		logger = log.New(ioutil.Discard, "", 0)
	}
}

// InitPlaywright initializes Playwright and launches the browser
func InitPlaywright() error {
	logger.Println("Initializing Playwright")
	var err error

	// Install Playwright and Chromium browser
	err = playwright.Install(&playwright.RunOptions{Browsers: []string{"chromium"}})
	if err != nil {
		return fmt.Errorf("could not install Playwright and Chromium: %v", err)
	}

	pw, err = playwright.Run()
	if err != nil {
		return fmt.Errorf("could not start Playwright: %v", err)
	}

	userAgent := "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

	browser, err = pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
		Args: []string{fmt.Sprintf("--user-agent=%s", userAgent)},
	})
	if err != nil {
		return fmt.Errorf("could not launch browser: %v", err)
	}

	logger.Println("Playwright initialized successfully")
	return nil
}

// ClosePlaywright closes the browser and stops Playwright
func ClosePlaywright() {
	if browser != nil {
		browser.Close()
	}
	if pw != nil {
		pw.Stop()
	}
}

// FetchWebpageContent retrieves the content of a webpage using Playwright
func FetchWebpageContent(urlStr string) (string, error) {
	logger.Printf("Fetching webpage content for URL: %s\n", urlStr)

	page, err := browser.NewPage()
	if err != nil {
		logger.Printf("Error creating new page: %v\n", err)
		return "", fmt.Errorf("could not create page: %v", err)
	}
	defer page.Close()

	// Randomized delay before navigating, a crude anti-bot measure.
	time.Sleep(time.Duration(rand.Intn(2000)+1000) * time.Millisecond)

	logger.Printf("Navigating to URL: %s\n", urlStr)
	if _, err = page.Goto(urlStr, playwright.PageGotoOptions{
		WaitUntil: playwright.WaitUntilStateNetworkidle,
	}); err != nil {
		logger.Printf("Error navigating to page: %v\n", err)
		return "", fmt.Errorf("could not go to page: %v", err)
	}

	logger.Println("Waiting for page load state")
	err = page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
		State: playwright.LoadStateNetworkidle,
	})
	if err != nil {
		logger.Printf("Error waiting for page load: %v\n", err)
		return "", fmt.Errorf("error waiting for page load: %v", err)
	}

	logger.Println("Scrolling page")
	err = scrollPage(page)
	if err != nil {
		logger.Printf("Error scrolling page: %v\n", err)
		return "", fmt.Errorf("error scrolling page: %v", err)
	}

	logger.Println("Waiting for body element")
	_, err = page.WaitForSelector("body", playwright.PageWaitForSelectorOptions{
		State: playwright.WaitForSelectorStateVisible,
	})
	if err != nil {
		logger.Printf("Error waiting for body: %v\n", err)
		return "", fmt.Errorf("error waiting for body: %v", err)
	}

	logger.Println("Getting page content")
	content, err := page.Content()
	if err != nil {
		logger.Printf("Error getting page content: %v\n", err)
		return "", fmt.Errorf("could not get page content: %v", err)
	}

	if content == "" {
		logger.Println("Content is empty, falling back to body content")
		content, err = page.InnerHTML("body")
		if err != nil {
			logger.Printf("Error getting body content: %v\n", err)
			return "", fmt.Errorf("could not get body content: %v", err)
		}
	}

	logger.Printf("Successfully fetched webpage content (length: %d)\n", len(content))
	return content, nil
}

// ProcessHTMLContent converts HTML content to Markdown
func ProcessHTMLContent(htmlContent string, config Config) (string, error) {
	logger.Printf("Processing HTML content (length: %d)\n", len(htmlContent))
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
	if err != nil {
		logger.Printf("Error parsing HTML: %v\n", err)
		return "", fmt.Errorf("error parsing HTML: %v", err)
	}

	selection := doc.Find("body")
	logger.Println("Processing entire body")

	if selection.Length() == 0 {
		return "", fmt.Errorf("no content found in the document")
	}

	content, err := selection.Html()
	if err != nil {
		logger.Printf("Error extracting content: %v\n", err)
		return "", fmt.Errorf("error extracting content: %v", err)
	}

	// Create a new converter
	converter := md.NewConverter("", true, nil)

	// Convert HTML to Markdown
	markdown, err := converter.ConvertString(content)
	if err != nil {
		logger.Printf("Error converting HTML to Markdown: %v\n", err)
		return "", fmt.Errorf("error converting HTML to Markdown: %v", err)
	}

	logger.Printf("Converted HTML to Markdown (length: %d)\n", len(markdown))
	return markdown, nil
}

func scrollPage(page playwright.Page) error {
	logger.Println("Starting page scroll")
	script := `
		() => {
			window.scrollTo(0, document.body.scrollHeight);
			return document.body.scrollHeight;
		}
	`

	previousHeight := 0
	for i := 0; i < 250; i++ {
		height, err := page.Evaluate(script)
		if err != nil {
			logger.Printf("Error scrolling (iteration %d): %v\n", i+1, err)
			return fmt.Errorf("error scrolling: %v", err)
		}

		var currentHeight int
		switch v := height.(type) {
		case int:
			currentHeight = v
		case float64:
			currentHeight = int(v)
		default:
			logger.Printf("Unexpected height type: %T\n", height)
			return fmt.Errorf("unexpected height type: %T", height)
		}

		logger.Printf("Scroll iteration %d: height = %d\n", i+1, currentHeight)

		if currentHeight == previousHeight {
			logger.Println("Reached bottom of the page")
			break
		}

		previousHeight = currentHeight

		page.WaitForTimeout(500)
	}

	logger.Println("Scrolling back to top")
	_, err := page.Evaluate(`() => { window.scrollTo(0, 0); }`)
	if err != nil {
		logger.Printf("Error scrolling back to top: %v\n", err)
		return fmt.Errorf("error scrolling back to top: %v", err)
	}

	logger.Println("Page scroll completed")
	return nil
}

// ExtractLinks extracts all links from the given URL
func ExtractLinks(urlStr string) ([]string, error) {
	logger.Printf("Extracting links from URL: %s\n", urlStr)

	page, err := browser.NewPage()
	if err != nil {
		return nil, fmt.Errorf("could not create page: %v", err)
	}
	defer page.Close()

	if _, err = page.Goto(urlStr, playwright.PageGotoOptions{
		WaitUntil: playwright.WaitUntilStateNetworkidle,
	}); err != nil {
		return nil, fmt.Errorf("could not go to page: %v", err)
	}

	links, err := page.Evaluate(`() => {
		const anchors = document.querySelectorAll('a');
		return Array.from(anchors).map(a => a.href);
	}`)
	if err != nil {
		return nil, fmt.Errorf("could not extract links: %v", err)
	}

	var result []string
	for _, link := range links.([]interface{}) {
		result = append(result, link.(string))
	}

	logger.Printf("Extracted %d links\n", len(result))
	return result, nil
}

// ExtractContentWithCSS extracts content from HTML using a CSS selector
func ExtractContentWithCSS(content, includeSelector string, excludeSelectors []string) (string, error) {
	logger.Printf("Extracting content with CSS selector: %s\n", includeSelector)

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(content))
	if err != nil {
		return "", fmt.Errorf("error parsing HTML: %v", err)
	}

	selection := doc.Find(includeSelector)
	if selection.Length() == 0 {
		logger.Printf("Warning: No content found with CSS selector: %s. Falling back to body content.\n", includeSelector)
		selection = doc.Find("body")
		if selection.Length() == 0 {
			return "", fmt.Errorf("no content found in body")
		}
	}

	for _, excludeSelector := range excludeSelectors {
		selection.Find(excludeSelector).Remove()
	}

	selectedContent, err := selection.Html()
	if err != nil {
		return "", fmt.Errorf("error extracting content with CSS selector: %v", err)
	}

	logger.Printf("Extracted content length: %d\n", len(selectedContent))
	return selectedContent, nil
}
```
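Note that the package-level `logger` stays nil until `SetupLogger` runs, and `browser` until `InitPlaywright` runs, so every exported entry point here assumes both calls happened first. A self-contained session sketch using only the functions defined above:

```go
package main

import (
	"fmt"
	"log"

	"github.com/tnypxl/rollup/internal/scraper"
)

func main() {
	// SetupLogger must precede any other scraper call: the package
	// functions write to the logger unconditionally.
	scraper.SetupLogger(true)

	if err := scraper.InitPlaywright(); err != nil {
		log.Fatalf("playwright init: %v", err)
	}
	defer scraper.ClosePlaywright()

	cfg := scraper.Config{
		URLs: []scraper.URLConfig{
			{URL: "https://example.com", CSSLocator: ".content", ExcludeSelectors: []string{".ads"}},
		},
		OutputType: "single",
		Verbose:    true,
	}

	content, err := scraper.ScrapeMultipleURLs(cfg)
	if err != nil {
		log.Fatalf("scrape: %v", err)
	}
	for url, markdown := range content {
		fmt.Printf("# Content from %s\n\n%s\n", url, markdown)
	}
}
```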