Mirror of https://github.com/tnypxl/rollup.git (synced 2025-12-15 15:03:17 +00:00)
feat: Implement scraping of multiple URLs with optional CSS locators and separate output files
Changed file: cmd/web.go (86 lines)
@@ -12,9 +12,9 @@ import (
 )
 
 var (
     urls             []string
-    outputFile       string
+    outputType       string
     depth            int
     includeSelector  string
     excludeSelectors []string
 )
@@ -31,7 +31,7 @@ var webCmd = &cobra.Command{
 func init() {
     rootCmd.AddCommand(webCmd)
     webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
-    webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-<timestamp>.md)")
+    webCmd.Flags().StringVarP(&outputType, "output", "o", "single", "Output type: 'single' for one file, 'separate' for multiple files")
     webCmd.Flags().IntVarP(&depth, "depth", "d", 0, "Depth of link traversal (default: 0, only scrape the given URLs)")
     webCmd.Flags().StringVar(&includeSelector, "css", "", "CSS selector to extract specific content")
     webCmd.Flags().StringSliceVar(&excludeSelectors, "exclude", []string{}, "CSS selectors to exclude from the extracted content (comma-separated)")
@@ -39,44 +39,80 @@ func init() {
 
 func runWeb(cmd *cobra.Command, args []string) error {
     // Use config if available, otherwise use command-line flags
-    if len(urls) == 0 && cfg.Scrape.URL != "" {
-        urls = []string{cfg.Scrape.URL}
+    var urlConfigs []scraper.URLConfig
+    if len(urls) == 0 && len(cfg.Scrape.URLs) > 0 {
+        urlConfigs = make([]scraper.URLConfig, len(cfg.Scrape.URLs))
+        for i, u := range cfg.Scrape.URLs {
+            urlConfigs[i] = scraper.URLConfig{
+                URL:         u.URL,
+                CSSLocator:  u.CSSLocator,
+                OutputAlias: u.OutputAlias,
+            }
+        }
+    } else {
+        urlConfigs = make([]scraper.URLConfig, len(urls))
+        for i, u := range urls {
+            urlConfigs[i] = scraper.URLConfig{URL: u, CSSLocator: includeSelector}
+        }
     }
 
-    if len(urls) == 0 {
-        return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.url' in the rollup.yml file")
+    if len(urlConfigs) == 0 {
+        return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.urls' in the rollup.yml file")
     }
 
-    if outputFile == "" {
-        outputFile = generateDefaultFilename(urls)
+    scraperConfig := scraper.Config{
+        URLs:       urlConfigs,
+        OutputType: outputType,
+        Verbose:    verbose,
     }
 
+    scrapedContent, err := scraper.ScrapeMultipleURLs(scraperConfig)
+    if err != nil {
+        return fmt.Errorf("error scraping content: %v", err)
+    }
+
+    if outputType == "single" {
+        return writeSingleFile(scrapedContent)
+    } else {
+        return writeMultipleFiles(scrapedContent)
+    }
+}
+
+func writeSingleFile(content map[string]string) error {
+    outputFile := generateDefaultFilename(urls)
     file, err := os.Create(outputFile)
     if err != nil {
         return fmt.Errorf("error creating output file: %v", err)
     }
     defer file.Close()
 
-    for i, u := range urls {
-        extractedContent, err := scrapeRecursively(u, depth)
-        if err != nil {
-            return fmt.Errorf("error scraping content from %s: %v", u, err)
-        }
-
-        if i > 0 {
-            _, err = file.WriteString("\n\n---\n\n")
-            if err != nil {
-                return fmt.Errorf("error writing separator to file: %v", err)
-            }
-        }
-
-        _, err = file.WriteString(extractedContent)
+    for url, c := range content {
+        _, err = file.WriteString(fmt.Sprintf("# Content from %s\n\n%s\n\n---\n\n", url, c))
         if err != nil {
             return fmt.Errorf("error writing content to file: %v", err)
         }
     }
 
-    fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(urls), outputFile)
+    fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(content), outputFile)
+    return nil
+}
+
+func writeMultipleFiles(content map[string]string) error {
+    for url, c := range content {
+        filename := scraper.GetFilenameFromContent(c, url)
+        file, err := os.Create(filename)
+        if err != nil {
+            return fmt.Errorf("error creating output file %s: %v", filename, err)
+        }
+
+        _, err = file.WriteString(fmt.Sprintf("# Content from %s\n\n%s", url, c))
+        file.Close()
+        if err != nil {
+            return fmt.Errorf("error writing content to file %s: %v", filename, err)
+        }
+
+        fmt.Printf("Content from %s has been saved to %s\n", url, filename)
+    }
     return nil
 }
 
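Taken together, the cmd/web.go changes repurpose --output: it no longer names an output file, it selects an output mode, 'single' (the default) or 'separate'. A rough invocation under the new flags might look like the following; the "rollup web" command name is inferred from the repository and webCmd rather than shown in this diff, and the URLs are placeholders:

    rollup web --urls https://example.com/docs,https://example.com/blog --css "article" --output separate

With --output single, all scraped pages are concatenated into one Markdown file with "# Content from <url>" headings and "---" separators; with --output separate, writeMultipleFiles gives each page its own file.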
@@ -15,8 +15,14 @@ type Config struct {
 }
 
 type ScrapeConfig struct {
-    URL        string `yaml:"url"`
-    CSSLocator string `yaml:"css_locator"`
+    URLs       []URLConfig `yaml:"urls"`
+    OutputType string      `yaml:"output_type"`
+}
+
+type URLConfig struct {
+    URL         string `yaml:"url"`
+    CSSLocator  string `yaml:"css_locator"`
+    OutputAlias string `yaml:"output_alias"`
 }
 
 func Load(configPath string) (*Config, error) {
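The URL list can also come from rollup.yml instead of flags. Based on the yaml tags above and the "scrape.urls" reference in runWeb's error message (all values below are placeholders, not taken from the repository), a configuration might look roughly like:

    scrape:
      output_type: separate
      urls:
        - url: https://example.com/docs
          css_locator: "article.main"
          output_alias: docs
        - url: https://example.com/blog
          output_alias: blog

Each entry may carry its own css_locator; output_alias is parsed and carried through to scraper.URLConfig, although this diff does not yet use it when naming output files.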
@@ -22,10 +22,85 @@ var (
 
 // Config holds the scraper configuration
 type Config struct {
-    CSSLocator string
+    URLs       []URLConfig
+    OutputType string
     Verbose    bool
 }
 
+// ScrapeMultipleURLs scrapes multiple URLs concurrently
+func ScrapeMultipleURLs(config Config) (map[string]string, error) {
+    results := make(chan struct {
+        url     string
+        content string
+        err     error
+    }, len(config.URLs))
+
+    for _, urlConfig := range config.URLs {
+        go func(cfg URLConfig) {
+            content, err := scrapeURL(cfg)
+            results <- struct {
+                url     string
+                content string
+                err     error
+            }{cfg.URL, content, err}
+        }(urlConfig)
+    }
+
+    scrapedContent := make(map[string]string)
+    for i := 0; i < len(config.URLs); i++ {
+        result := <-results
+        if result.err != nil {
+            logger.Printf("Error scraping %s: %v\n", result.url, result.err)
+            continue
+        }
+        scrapedContent[result.url] = result.content
+    }
+
+    return scrapedContent, nil
+}
+
+func scrapeURL(config URLConfig) (string, error) {
+    content, err := FetchWebpageContent(config.URL)
+    if err != nil {
+        return "", err
+    }
+
+    if config.CSSLocator != "" {
+        content, err = ExtractContentWithCSS(content, config.CSSLocator, nil)
+        if err != nil {
+            return "", err
+        }
+    }
+
+    return ProcessHTMLContent(content, Config{})
+}
+
+func getFilenameFromContent(content, url string) string {
+    // Try to extract title from content
+    titleStart := strings.Index(content, "<title>")
+    titleEnd := strings.Index(content, "</title>")
+    if titleStart != -1 && titleEnd != -1 && titleEnd > titleStart {
+        title := content[titleStart+7 : titleEnd]
+        return sanitizeFilename(title) + ".md"
+    }
+
+    // If no title found, use the URL
+    return sanitizeFilename(url) + ".md"
+}
+
+func sanitizeFilename(name string) string {
+    // Remove any character that isn't alphanumeric, dash, or underscore
+    reg, _ := regexp.Compile("[^a-zA-Z0-9-_]+")
+    return reg.ReplaceAllString(name, "_")
+}
+
+// URLConfig holds configuration for a single URL
+type URLConfig struct {
+    URL         string
+    CSSLocator  string
+    OutputAlias string
+}
+
 // SetupLogger initializes the logger based on the verbose flag
 func SetupLogger(verbose bool) {
     if verbose {
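ScrapeMultipleURLs fans each URLConfig out to its own goroutine, collects the results over a buffered channel, and logs and skips failed URLs rather than aborting, so as written the returned error is always nil. A minimal sketch of a caller, mirroring what runWeb now does (the import path and example URLs are assumptions, not shown in this diff):

    package main

    import (
        "fmt"
        "log"

        // Import path assumed from the repository layout; adjust to the real module path.
        "github.com/tnypxl/rollup/scraper"
    )

    func main() {
        // Initialize the scraper's logger so per-URL failures are reported.
        scraper.SetupLogger(true)

        // Assemble the same Config that runWeb builds from flags or rollup.yml.
        cfg := scraper.Config{
            URLs: []scraper.URLConfig{
                {URL: "https://example.com/docs", CSSLocator: "article"}, // placeholder URLs
                {URL: "https://example.com/blog"},
            },
            OutputType: "separate",
            Verbose:    true,
        }

        // Scrape all URLs concurrently; failed URLs are logged and omitted from the map.
        content, err := scraper.ScrapeMultipleURLs(cfg)
        if err != nil {
            log.Fatal(err)
        }

        // Keys are the scraped URLs, values the processed Markdown content.
        for url, md := range content {
            fmt.Printf("%s -> %d bytes of Markdown\n", url, len(md))
        }
    }

Note that each URL gets its own goroutine with no worker pool or rate limiting, so a very long URL list opens that many fetches at once.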