Mirror of https://github.com/tnypxl/rollup.git (synced 2025-12-13 06:23:18 +00:00)
Adds a configuration layer using rollup.yml, which may be preferred over CLI flags.
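For illustration, a minimal rollup.yml wired to the fields this commit defines in internal/config/config.go might look like the following; the values are placeholders, not part of the commit:

    file_types:
      - go
      - md
    ignore:
      - vendor/**
    code_generated:
      - "*_generated.go"
    scrape:
      url: https://example.com/docs
      css_locator: "main article"

Only scrape.url and scrape.css_locator are read by the new web command; the other keys map to the remaining Config fields.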
3  .gitignore  vendored

@@ -19,3 +19,6 @@ go.work

# Ignore rollup output files
*rollup*.md

# Ignore rollup config file
rollup.yml
304  cmd/web.go

@@ -1,154 +1,156 @@
package cmd

import (
    "fmt"
    "net/url"
    "os"
    "strings"
    "time"
    // TODO: Implement web scraping (DO NOT EDIT)

    "github.com/JohannesKaufmann/html-to-markdown"
    "github.com/spf13/cobra"
    "github.com/tnypxl/rollup/internal/config"
    "github.com/tnypxl/rollup/internal/scraper"
)

var (
    urls       []string
    outputFile string
)

var webCmd = &cobra.Command{
    Use:   "web",
    Short: "Scrape main content from webpages and convert to Markdown",
    Long:  `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`,
    RunE:  runWeb,
}

func init() {
    rootCmd.AddCommand(webCmd)
    webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
    webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-<timestamp>.md)")
}

func runWeb(cmd *cobra.Command, args []string) error {
    var err error
    cfg, err = config.Load("rollup.yml")
    if err != nil {
        if os.IsNotExist(err) {
            return fmt.Errorf("rollup.yml file not found. Please create a configuration file or provide command-line arguments")
        }
        return fmt.Errorf("error loading configuration: %v", err)
    }

    // Use config if available, otherwise use command-line flags
    if len(urls) == 0 && cfg.Scrape.URL != "" {
        urls = []string{cfg.Scrape.URL}
    }

    if len(urls) == 0 {
        return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.url' in the rollup.yml file")
    }

    if outputFile == "" {
        outputFile = generateDefaultFilename(urls)
    }

    file, err := os.Create(outputFile)
    if err != nil {
        return fmt.Errorf("error creating output file: %v", err)
    }
    defer file.Close()

    for i, u := range urls {
        extractedContent, err := extractAndConvertContent(u)
        if err != nil {
            return fmt.Errorf("error extracting and converting content from %s: %v", u, err)
        }

        if i > 0 {
            _, err = file.WriteString("\n\n---\n\n")
            if err != nil {
                return fmt.Errorf("error writing separator to file: %v", err)
            }
        }

        _, err = file.WriteString(extractedContent)
        if err != nil {
            return fmt.Errorf("error writing content to file: %v", err)
        }
    }

    fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(urls), outputFile)
    return nil
}

func generateDefaultFilename(urls []string) string {
    var hostnames []string
    for _, u := range urls {
        parsedURL, err := url.Parse(u)
        if err == nil {
            hostnames = append(hostnames, parsedURL.Hostname())
        }
    }

    var baseFilename string
    if len(hostnames) == 1 {
        baseFilename = hostnames[0]
    } else if len(hostnames) == 2 {
        baseFilename = fmt.Sprintf("%s-and-%s", hostnames[0], hostnames[1])
    } else if len(hostnames) > 2 {
        baseFilename = fmt.Sprintf("%s-and-%d-others", hostnames[0], len(hostnames)-1)
    } else {
        baseFilename = "web-content"
    }

    baseFilename = strings.NewReplacer(
        ".com", "",
        ".org", "",
        ".net", "",
        ".edu", "",
        ".", "-",
    ).Replace(baseFilename)

    if len(baseFilename) > 50 {
        baseFilename = baseFilename[:50]
    }

    timestamp := time.Now().Format("20060102-150405")
    return fmt.Sprintf("%s-%s.md", baseFilename, timestamp)
}

func extractAndConvertContent(urlStr string) (string, error) {
    content, err := scraper.FetchWebpageContent(urlStr)
    if err != nil {
        return "", fmt.Errorf("error fetching webpage content: %v", err)
    }

    // Use the CSS locator from the config
    cssLocator := cfg.Scrape.CSSLocator
    if cssLocator != "" {
        // TODO: Implement content extraction with CSS selector
        // content, err = scraper.ExtractContentWithCSS(content, cssLocator)
        // if err != nil {
        //     return "", fmt.Errorf("error extracting content with CSS selector: %v", err)
        // }
    }

    // Create a new converter
    converter := md.NewConverter("", true, nil)

    // Convert HTML to Markdown
    markdown, err := converter.ConvertString(content)
    if err != nil {
        return "", fmt.Errorf("error converting HTML to Markdown: %v", err)
    }

    parsedURL, err := url.Parse(urlStr)
    if err != nil {
        return "", fmt.Errorf("error parsing URL: %v", err)
    }
    header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String())

    return header + markdown + "\n\n", nil
}
// import (
// "fmt"
// "net/url"
// "os"
// "strings"
// "time"
//
// "github.com/JohannesKaufmann/html-to-markdown"
// "github.com/spf13/cobra"
// "github.com/tnypxl/rollup/internal/config"
// "github.com/tnypxl/rollup/internal/scraper"
// )
//
// var (
// urls []string
// outputFile string
// )
//
// var webCmd = &cobra.Command{
// Use: "web",
// Short: "Scrape main content from webpages and convert to Markdown",
// Long: `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`,
// RunE: runWeb,
// }
//
// func init() {
// rootCmd.AddCommand(webCmd)
// webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
// webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-<timestamp>.md)")
// }
//
// func runWeb(cmd *cobra.Command, args []string) error {
// var err error
// cfg, err = config.Load("rollup.yml")
// if err != nil {
// if os.IsNotExist(err) {
// return fmt.Errorf("rollup.yml file not found. Please create a configuration file or provide command-line arguments")
// }
// return fmt.Errorf("error loading configuration: %v", err)
// }
//
// // Use config if available, otherwise use command-line flags
// if len(urls) == 0 && cfg.Scrape.URL != "" {
// urls = []string{cfg.Scrape.URL}
// }
//
// if len(urls) == 0 {
// return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.url' in the rollup.yml file")
// }
//
// if outputFile == "" {
// outputFile = generateDefaultFilename(urls)
// }
//
// file, err := os.Create(outputFile)
// if err != nil {
// return fmt.Errorf("error creating output file: %v", err)
// }
// defer file.Close()
//
// for i, u := range urls {
// extractedContent, err := extractAndConvertContent(u)
// if err != nil {
// return fmt.Errorf("error extracting and converting content from %s: %v", u, err)
// }
//
// if i > 0 {
// _, err = file.WriteString("\n\n---\n\n")
// if err != nil {
// return fmt.Errorf("error writing separator to file: %v", err)
// }
// }
//
// _, err = file.WriteString(extractedContent)
// if err != nil {
// return fmt.Errorf("error writing content to file: %v", err)
// }
// }
//
// fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(urls), outputFile)
// return nil
// }
//
// func generateDefaultFilename(urls []string) string {
// var hostnames []string
// for _, u := range urls {
// parsedURL, err := url.Parse(u)
// if err == nil {
// hostnames = append(hostnames, parsedURL.Hostname())
// }
// }
//
// var baseFilename string
// if len(hostnames) == 1 {
// baseFilename = hostnames[0]
// } else if len(hostnames) == 2 {
// baseFilename = fmt.Sprintf("%s-and-%s", hostnames[0], hostnames[1])
// } else if len(hostnames) > 2 {
// baseFilename = fmt.Sprintf("%s-and-%d-others", hostnames[0], len(hostnames)-1)
// } else {
// baseFilename = "web-content"
// }
//
// baseFilename = strings.NewReplacer(
// ".com", "",
// ".org", "",
// ".net", "",
// ".edu", "",
// ".", "-",
// ).Replace(baseFilename)
//
// if len(baseFilename) > 50 {
// baseFilename = baseFilename[:50]
// }
//
// timestamp := time.Now().Format("20060102-150405")
// return fmt.Sprintf("%s-%s.md", baseFilename, timestamp)
// }
//
// func extractAndConvertContent(urlStr string) (string, error) {
// content, err := scraper.FetchWebpageContent(urlStr)
// if err != nil {
// return "", fmt.Errorf("error fetching webpage content: %v", err)
// }
//
// // Use the CSS locator from the config
// cssLocator := cfg.Scrape.CSSLocator
// if cssLocator != "" {
// // TODO: Implement content extraction with CSS selector
// // content, err = scraper.ExtractContentWithCSS(content, cssLocator)
// // if err != nil {
// // return "", fmt.Errorf("error extracting content with CSS selector: %v", err)
// // }
// }
//
// // Create a new converter
// converter := md.NewConverter("", true, nil)
//
// // Convert HTML to Markdown
// markdown, err := converter.ConvertString(content)
// if err != nil {
// return "", fmt.Errorf("error converting HTML to Markdown: %v", err)
// }
//
// parsedURL, err := url.Parse(urlStr)
// if err != nil {
// return "", fmt.Errorf("error parsing URL: %v", err)
// }
// header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String())
//
// return header + markdown + "\n\n", nil
// }
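For reference, a typical invocation of the web subcommand defined above (assuming the built binary is named rollup, per the module path) would be:

    rollup web --urls https://example.com,https://example.org --output combined.md

When --output is omitted, generateDefaultFilename derives the name from the URL hostnames plus a timestamp, for example example-and-example-<timestamp>.md.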
16  go.mod

@@ -2,12 +2,24 @@ module github.com/tnypxl/rollup

go 1.23

require github.com/spf13/cobra v1.8.1

require (
    github.com/anthropics/anthropic-sdk-go v0.5.0
    github.com/spf13/cobra v1.8.1
    github.com/andybalholm/cascadia v1.3.2 // indirect
    github.com/deckarep/golang-set/v2 v2.6.0 // indirect
    github.com/go-jose/go-jose/v3 v3.0.3 // indirect
    github.com/go-stack/stack v1.8.1 // indirect
    github.com/kr/text v0.1.0 // indirect
    go.uber.org/multierr v1.11.0 // indirect
    golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 // indirect
    golang.org/x/net v0.27.0 // indirect
)

require (
    github.com/PuerkitoBio/goquery v1.9.2
    github.com/inconshreveable/mousetrap v1.1.0 // indirect
    github.com/playwright-community/playwright-go v0.4501.1
    github.com/russross/blackfriday/v2 v2.1.0
    github.com/spf13/pflag v1.0.5 // indirect
    gopkg.in/yaml.v2 v2.4.0
)
85  go.sum

@@ -1,10 +1,95 @@
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/deckarep/golang-set/v2 v2.6.0 h1:XfcQbWM1LlMB8BsJ8N9vW5ehnnPVIw0je80NsVHagjM=
github.com/deckarep/golang-set/v2 v2.6.0/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4=
github.com/go-jose/go-jose/v3 v3.0.3 h1:fFKWeig/irsp7XD2zBxvnmA/XaRWp5V3CBsZXJF7G7k=
github.com/go-jose/go-jose/v3 v3.0.3/go.mod h1:5b+7YgP7ZICgJDBdfjZaIt+H/9L9T/YQrVfLAMboGkQ=
github.com/go-stack/stack v1.8.1 h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw=
github.com/go-stack/stack v1.8.1/go.mod h1:dcoOX6HbPZSZptuspn9bctJ+N/CnF5gGygcUP3XYfe4=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/mitchellh/go-ps v1.0.0 h1:i6ampVEEF4wQFF+bkYfwYgY+F/uYJDktmvLPf7qIgjc=
github.com/mitchellh/go-ps v1.0.0/go.mod h1:J4lOc8z8yJs6vUwklHw2XEIiT4z4C40KtWVN3nvg8Pg=
github.com/playwright-community/playwright-go v0.4501.1 h1:kz8SIfR6nEI8blk77nTVD0K5/i37QP5rY/o8a1fG+4c=
github.com/playwright-community/playwright-go v0.4501.1/go.mod h1:bpArn5TqNzmP0jroCgw4poSOG9gSeQg490iLqWAaa7w=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M=
github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 h1:vr/HnozRka3pE4EsMEg1lgkXJkTFJCVUX+S/ZT6wYzM=
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842/go.mod h1:XtvwrStGgqGPLc4cjQfWqZHG1YFdYs6swckp8vpsjnc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys=
golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
45  internal/config/config.go  Normal file

@@ -0,0 +1,45 @@
package config

import (
    "fmt"
    "os"

    "gopkg.in/yaml.v2"
)

type Config struct {
    FileTypes     []string     `yaml:"file_types"`
    Ignore        []string     `yaml:"ignore"`
    CodeGenerated []string     `yaml:"code_generated"`
    Scrape        ScrapeConfig `yaml:"scrape"`
}

type ScrapeConfig struct {
    URL        string `yaml:"url"`
    CSSLocator string `yaml:"css_locator"`
}

func Load(configPath string) (*Config, error) {
    data, err := os.ReadFile(configPath)
    if err != nil {
        return nil, fmt.Errorf("error reading config file: %v", err)
    }

    var config Config
    err = yaml.Unmarshal(data, &config)
    if err != nil {
        return nil, fmt.Errorf("error parsing config file: %v", err)
    }

    return &config, nil
}

func DefaultConfigPath() string {
    return "rollup.yml"
}

func FileExists(filename string) bool {
    _, err := os.Stat(filename)
    return err == nil
}
209  internal/scraper/scraper.go  Normal file

@@ -0,0 +1,209 @@
package scraper

import (
    "fmt"
    "log"
    "math/rand"
    "strings"
    "time"

    "github.com/PuerkitoBio/goquery"
    "github.com/playwright-community/playwright-go"
    "github.com/russross/blackfriday/v2"
)

var (
    pw      *playwright.Playwright
    browser playwright.Browser
)

// Config holds the scraper configuration
type Config struct {
    CSSLocator string
}

// InitPlaywright initializes Playwright and launches the browser
func InitPlaywright() error {
    log.Println("Initializing Playwright")
    var err error
    pw, err = playwright.Run()
    if err != nil {
        return fmt.Errorf("could not start Playwright: %v", err)
    }

    userAgent := "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

    browser, err = pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
        Args: []string{fmt.Sprintf("--user-agent=%s", userAgent)},
    })
    if err != nil {
        return fmt.Errorf("could not launch browser: %v", err)
    }

    log.Println("Playwright initialized successfully")
    return nil
}

// ClosePlaywright closes the browser and stops Playwright
func ClosePlaywright() {
    if browser != nil {
        browser.Close()
    }
    if pw != nil {
        pw.Stop()
    }
}

// FetchWebpageContent retrieves the content of a webpage using Playwright
func FetchWebpageContent(urlStr string) (string, error) {
    log.Printf("Fetching webpage content for URL: %s\n", urlStr)

    page, err := browser.NewPage()
    if err != nil {
        log.Printf("Error creating new page: %v\n", err)
        return "", fmt.Errorf("could not create page: %v", err)
    }
    defer page.Close()

    err = page.EmulateMedia(playwright.PageEmulateMediaOptions{
        Media: playwright.MediaPrint,
    })
    if err != nil {
        log.Printf("Error emulating print media: %v\n", err)
        return "", fmt.Errorf("could not emulate print media: %v", err)
    }

    time.Sleep(time.Duration(rand.Intn(2000)+1000) * time.Millisecond)

    log.Printf("Navigating to URL: %s\n", urlStr)
    if _, err = page.Goto(urlStr, playwright.PageGotoOptions{
        WaitUntil: playwright.WaitUntilStateNetworkidle,
    }); err != nil {
        log.Printf("Error navigating to page: %v\n", err)
        return "", fmt.Errorf("could not go to page: %v", err)
    }

    log.Println("Waiting for page load state")
    err = page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
        State: playwright.LoadStateNetworkidle,
    })
    if err != nil {
        log.Printf("Error waiting for page load: %v\n", err)
        return "", fmt.Errorf("error waiting for page load: %v", err)
    }

    log.Println("Scrolling page")
    err = scrollPage(page)
    if err != nil {
        log.Printf("Error scrolling page: %v\n", err)
        return "", fmt.Errorf("error scrolling page: %v", err)
    }

    log.Println("Waiting for body element")
    _, err = page.WaitForSelector("body", playwright.PageWaitForSelectorOptions{
        State: playwright.WaitForSelectorStateVisible,
    })
    if err != nil {
        log.Printf("Error waiting for body: %v\n", err)
        return "", fmt.Errorf("error waiting for body: %v", err)
    }

    log.Println("Getting page content")
    content, err := page.Content()
    if err != nil {
        log.Printf("Error getting page content: %v\n", err)
        return "", fmt.Errorf("could not get page content: %v", err)
    }

    log.Printf("Successfully fetched webpage content (length: %d)\n", len(content))
    return content, nil
}

// ProcessHTMLContent converts HTML content to Markdown
func ProcessHTMLContent(htmlContent string, config Config) (string, error) {
    log.Printf("Processing HTML content (length: %d)\n", len(htmlContent))
    doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
    if err != nil {
        log.Printf("Error parsing HTML: %v\n", err)
        return "", fmt.Errorf("error parsing HTML: %v", err)
    }

    var content string
    if config.CSSLocator != "" {
        log.Printf("Using CSS locator: %s\n", config.CSSLocator)
        content, err = doc.Find(config.CSSLocator).Html()
        if err != nil {
            log.Printf("Error extracting content with CSS locator: %v\n", err)
            return "", fmt.Errorf("error extracting content with CSS locator: %v", err)
        }
    } else {
        log.Println("No CSS locator provided, processing entire body")
        content, err = doc.Find("body").Html()
        if err != nil {
            log.Printf("Error extracting body content: %v\n", err)
            return "", fmt.Errorf("error extracting body content: %v", err)
        }
    }

    markdown := convertToMarkdown(content)
    log.Printf("Converted HTML to Markdown (length: %d)\n", len(markdown))
    return markdown, nil
}

func convertToMarkdown(html string) string {
    // Use a simple HTML-to-Markdown conversion
    markdown := blackfriday.Run([]byte(html),
        blackfriday.WithExtensions(blackfriday.CommonExtensions|blackfriday.HardLineBreak))
    return string(markdown)
}

func scrollPage(page playwright.Page) error {
    log.Println("Starting page scroll")
    script := `
        () => {
            window.scrollTo(0, document.body.scrollHeight);
            return document.body.scrollHeight;
        }
    `

    previousHeight := 0
    for i := 0; i < 250; i++ {
        height, err := page.Evaluate(script)
        if err != nil {
            log.Printf("Error scrolling (iteration %d): %v\n", i+1, err)
            return fmt.Errorf("error scrolling: %v", err)
        }

        var currentHeight int
        switch v := height.(type) {
        case int:
            currentHeight = v
        case float64:
            currentHeight = int(v)
        default:
            log.Printf("Unexpected height type: %T\n", height)
            return fmt.Errorf("unexpected height type: %T", height)
        }

        log.Printf("Scroll iteration %d: height = %d\n", i+1, currentHeight)

        if currentHeight == previousHeight {
            log.Println("Reached bottom of the page")
            break
        }

        previousHeight = currentHeight

        page.WaitForTimeout(500)
    }

    log.Println("Scrolling back to top")
    _, err := page.Evaluate(`() => { window.scrollTo(0, 0); }`)
    if err != nil {
        log.Printf("Error scrolling back to top: %v\n", err)
        return fmt.Errorf("error scrolling back to top: %v", err)
    }

    log.Println("Page scroll completed")
    return nil
}
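As a standalone sketch (not part of the commit; the URL and CSS locator are placeholders), the exported pieces of this package compose as follows:

    package main

    import (
        "fmt"
        "log"

        "github.com/tnypxl/rollup/internal/scraper"
    )

    func main() {
        // Start Playwright and launch the shared browser once per process.
        if err := scraper.InitPlaywright(); err != nil {
            log.Fatalf("init playwright: %v", err)
        }
        defer scraper.ClosePlaywright()

        // Fetch the fully rendered HTML for a page.
        html, err := scraper.FetchWebpageContent("https://example.com")
        if err != nil {
            log.Fatalf("fetch: %v", err)
        }

        // Reduce the document to a CSS-selected region and convert it.
        out, err := scraper.ProcessHTMLContent(html, scraper.Config{CSSLocator: "main"})
        if err != nil {
            log.Fatalf("process: %v", err)
        }
        fmt.Println(out)
    }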
18  main.go

@@ -2,12 +2,30 @@ package main

import (
    "fmt"
    "log"
    "os"

    "github.com/tnypxl/rollup/cmd"
    "github.com/tnypxl/rollup/internal/config"
    "github.com/tnypxl/rollup/internal/scraper"
)

var cfg *config.Config

func main() {
    configPath := config.DefaultConfigPath()
    var err error
    cfg, err = config.Load(configPath)
    if err != nil {
        log.Fatalf("Failed to load configuration: %v", err)
    }

    err = scraper.InitPlaywright()
    if err != nil {
        log.Fatalf("Failed to initialize Playwright: %v", err)
    }
    defer scraper.ClosePlaywright()

    if err := cmd.Execute(); err != nil {
        fmt.Println(err)
        os.Exit(1)
    }
}