From 0163c4e50479933479dd67e0b540053a104b6dfa Mon Sep 17 00:00:00 2001 From: Arik Jones Date: Thu, 5 Sep 2024 23:41:39 -0500 Subject: [PATCH] Adds a configuration layer for using rollup.yml, which may be preferred over CLI flags. --- .gitignore | 3 + cmd/web.go | 304 ++++++++++++++++++------------------ go.mod | 16 +- go.sum | 85 ++++++++++ internal/config/config.go | 45 ++++++ internal/scraper/scraper.go | 209 +++++++++++++++++++++++++ main.go | 18 +++ 7 files changed, 527 insertions(+), 153 deletions(-) create mode 100644 internal/config/config.go create mode 100644 internal/scraper/scraper.go diff --git a/.gitignore b/.gitignore index b235000..5aca2fc 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,6 @@ go.work # Ignore rollup output files *rollup*.md + +# Ignore rollup config file +rollup.yml diff --git a/cmd/web.go b/cmd/web.go index 849ecb2..9cd9f19 100644 --- a/cmd/web.go +++ b/cmd/web.go @@ -1,154 +1,156 @@ package cmd -import ( - "fmt" - "net/url" - "os" - "strings" - "time" +// TODO: Implement web scraping (DO NOT EDIT) - "github.com/JohannesKaufmann/html-to-markdown" - "github.com/spf13/cobra" - "github.com/tnypxl/rollup/internal/config" - "github.com/tnypxl/rollup/internal/scraper" -) - -var ( - urls []string - outputFile string -) - -var webCmd = &cobra.Command{ - Use: "web", - Short: "Scrape main content from webpages and convert to Markdown", - Long: `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. 
Convert the content to a well-structured Markdown file.`, - RunE: runWeb, -} - -func init() { - rootCmd.AddCommand(webCmd) - webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)") - webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-.md)") -} - -func runWeb(cmd *cobra.Command, args []string) error { - var err error - cfg, err = config.Load("rollup.yml") - if err != nil { - if os.IsNotExist(err) { - return fmt.Errorf("rollup.yml file not found. Please create a configuration file or provide command-line arguments") - } - return fmt.Errorf("error loading configuration: %v", err) - } - - // Use config if available, otherwise use command-line flags - if len(urls) == 0 && cfg.Scrape.URL != "" { - urls = []string{cfg.Scrape.URL} - } - - if len(urls) == 0 { - return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.url' in the rollup.yml file") - } - - if outputFile == "" { - outputFile = generateDefaultFilename(urls) - } - - file, err := os.Create(outputFile) - if err != nil { - return fmt.Errorf("error creating output file: %v", err) - } - defer file.Close() - - for i, u := range urls { - extractedContent, err := extractAndConvertContent(u) - if err != nil { - return fmt.Errorf("error extracting and converting content from %s: %v", u, err) - } - - if i > 0 { - _, err = file.WriteString("\n\n---\n\n") - if err != nil { - return fmt.Errorf("error writing separator to file: %v", err) - } - } - - _, err = file.WriteString(extractedContent) - if err != nil { - return fmt.Errorf("error writing content to file: %v", err) - } - } - - fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(urls), outputFile) - return nil -} - -func generateDefaultFilename(urls []string) string { - var hostnames []string - for _, u := range urls { - parsedURL, err := url.Parse(u) - if err == nil { - hostnames = append(hostnames, 
parsedURL.Hostname()) - } - } - - var baseFilename string - if len(hostnames) == 1 { - baseFilename = hostnames[0] - } else if len(hostnames) == 2 { - baseFilename = fmt.Sprintf("%s-and-%s", hostnames[0], hostnames[1]) - } else if len(hostnames) > 2 { - baseFilename = fmt.Sprintf("%s-and-%d-others", hostnames[0], len(hostnames)-1) - } else { - baseFilename = "web-content" - } - - baseFilename = strings.NewReplacer( - ".com", "", - ".org", "", - ".net", "", - ".edu", "", - ".", "-", - ).Replace(baseFilename) - - if len(baseFilename) > 50 { - baseFilename = baseFilename[:50] - } - - timestamp := time.Now().Format("20060102-150405") - return fmt.Sprintf("%s-%s.md", baseFilename, timestamp) -} - -func extractAndConvertContent(urlStr string) (string, error) { - content, err := scraper.FetchWebpageContent(urlStr) - if err != nil { - return "", fmt.Errorf("error fetching webpage content: %v", err) - } - - // Use the CSS locator from the config - cssLocator := cfg.Scrape.CSSLocator - if cssLocator != "" { - // TODO: Implement content extraction with CSS selector - // content, err = scraper.ExtractContentWithCSS(content, cssLocator) - // if err != nil { - // return "", fmt.Errorf("error extracting content with CSS selector: %v", err) - // } - } - - // Create a new converter - converter := md.NewConverter("", true, nil) - - // Convert HTML to Markdown - markdown, err := converter.ConvertString(content) - if err != nil { - return "", fmt.Errorf("error converting HTML to Markdown: %v", err) - } - - parsedURL, err := url.Parse(urlStr) - if err != nil { - return "", fmt.Errorf("error parsing URL: %v", err) - } - header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String()) - - return header + markdown + "\n\n", nil -} +// import ( +// "fmt" +// "net/url" +// "os" +// "strings" +// "time" +// +// "github.com/JohannesKaufmann/html-to-markdown" +// "github.com/spf13/cobra" +// "github.com/tnypxl/rollup/internal/config" +// "github.com/tnypxl/rollup/internal/scraper" +// ) +// 
+// var ( +// urls []string +// outputFile string +// ) +// +// var webCmd = &cobra.Command{ +// Use: "web", +// Short: "Scrape main content from webpages and convert to Markdown", +// Long: `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`, +// RunE: runWeb, +// } +// +// func init() { +// rootCmd.AddCommand(webCmd) +// webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)") +// webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-.md)") +// } +// +// func runWeb(cmd *cobra.Command, args []string) error { +// var err error +// cfg, err = config.Load("rollup.yml") +// if err != nil { +// if os.IsNotExist(err) { +// return fmt.Errorf("rollup.yml file not found. Please create a configuration file or provide command-line arguments") +// } +// return fmt.Errorf("error loading configuration: %v", err) +// } +// +// // Use config if available, otherwise use command-line flags +// if len(urls) == 0 && cfg.Scrape.URL != "" { +// urls = []string{cfg.Scrape.URL} +// } +// +// if len(urls) == 0 { +// return fmt.Errorf("no URLs provided. 
Use --urls flag with comma-separated URLs or set 'scrape.url' in the rollup.yml file") +// } +// +// if outputFile == "" { +// outputFile = generateDefaultFilename(urls) +// } +// +// file, err := os.Create(outputFile) +// if err != nil { +// return fmt.Errorf("error creating output file: %v", err) +// } +// defer file.Close() +// +// for i, u := range urls { +// extractedContent, err := extractAndConvertContent(u) +// if err != nil { +// return fmt.Errorf("error extracting and converting content from %s: %v", u, err) +// } +// +// if i > 0 { +// _, err = file.WriteString("\n\n---\n\n") +// if err != nil { +// return fmt.Errorf("error writing separator to file: %v", err) +// } +// } +// +// _, err = file.WriteString(extractedContent) +// if err != nil { +// return fmt.Errorf("error writing content to file: %v", err) +// } +// } +// +// fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(urls), outputFile) +// return nil +// } +// +// func generateDefaultFilename(urls []string) string { +// var hostnames []string +// for _, u := range urls { +// parsedURL, err := url.Parse(u) +// if err == nil { +// hostnames = append(hostnames, parsedURL.Hostname()) +// } +// } +// +// var baseFilename string +// if len(hostnames) == 1 { +// baseFilename = hostnames[0] +// } else if len(hostnames) == 2 { +// baseFilename = fmt.Sprintf("%s-and-%s", hostnames[0], hostnames[1]) +// } else if len(hostnames) > 2 { +// baseFilename = fmt.Sprintf("%s-and-%d-others", hostnames[0], len(hostnames)-1) +// } else { +// baseFilename = "web-content" +// } +// +// baseFilename = strings.NewReplacer( +// ".com", "", +// ".org", "", +// ".net", "", +// ".edu", "", +// ".", "-", +// ).Replace(baseFilename) +// +// if len(baseFilename) > 50 { +// baseFilename = baseFilename[:50] +// } +// +// timestamp := time.Now().Format("20060102-150405") +// return fmt.Sprintf("%s-%s.md", baseFilename, timestamp) +// } +// +// func extractAndConvertContent(urlStr string) (string, error) 
{ +// content, err := scraper.FetchWebpageContent(urlStr) +// if err != nil { +// return "", fmt.Errorf("error fetching webpage content: %v", err) +// } +// +// // Use the CSS locator from the config +// cssLocator := cfg.Scrape.CSSLocator +// if cssLocator != "" { +// // TODO: Implement content extraction with CSS selector +// // content, err = scraper.ExtractContentWithCSS(content, cssLocator) +// // if err != nil { +// // return "", fmt.Errorf("error extracting content with CSS selector: %v", err) +// // } +// } +// +// // Create a new converter +// converter := md.NewConverter("", true, nil) +// +// // Convert HTML to Markdown +// markdown, err := converter.ConvertString(content) +// if err != nil { +// return "", fmt.Errorf("error converting HTML to Markdown: %v", err) +// } +// +// parsedURL, err := url.Parse(urlStr) +// if err != nil { +// return "", fmt.Errorf("error parsing URL: %v", err) +// } +// header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String()) +// +// return header + markdown + "\n\n", nil +// } diff --git a/go.mod b/go.mod index 893f419..06630ea 100644 --- a/go.mod +++ b/go.mod @@ -2,12 +2,24 @@ module github.com/tnypxl/rollup go 1.23 +require github.com/spf13/cobra v1.8.1 + require ( - github.com/anthropics/anthropic-sdk-go v0.5.0 - github.com/spf13/cobra v1.8.1 + github.com/andybalholm/cascadia v1.3.2 // indirect + github.com/deckarep/golang-set/v2 v2.6.0 // indirect + github.com/go-jose/go-jose/v3 v3.0.3 // indirect + github.com/go-stack/stack v1.8.1 // indirect + github.com/kr/text v0.1.0 // indirect + go.uber.org/multierr v1.11.0 // indirect + golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 // indirect + golang.org/x/net v0.27.0 // indirect ) require ( + github.com/PuerkitoBio/goquery v1.9.2 github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/playwright-community/playwright-go v0.4501.1 + github.com/russross/blackfriday/v2 v2.1.0 github.com/spf13/pflag v1.0.5 // indirect + gopkg.in/yaml.v2 v2.4.0 ) diff --git 
a/go.sum b/go.sum index 912390a..00f0e3f 100644 --- a/go.sum +++ b/go.sum @@ -1,10 +1,95 @@ +github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE= +github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk= +github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= +github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/deckarep/golang-set/v2 v2.6.0 h1:XfcQbWM1LlMB8BsJ8N9vW5ehnnPVIw0je80NsVHagjM= +github.com/deckarep/golang-set/v2 v2.6.0/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4= +github.com/go-jose/go-jose/v3 v3.0.3 h1:fFKWeig/irsp7XD2zBxvnmA/XaRWp5V3CBsZXJF7G7k= +github.com/go-jose/go-jose/v3 v3.0.3/go.mod h1:5b+7YgP7ZICgJDBdfjZaIt+H/9L9T/YQrVfLAMboGkQ= +github.com/go-stack/stack v1.8.1 h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw= +github.com/go-stack/stack v1.8.1/go.mod h1:dcoOX6HbPZSZptuspn9bctJ+N/CnF5gGygcUP3XYfe4= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod 
h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/mitchellh/go-ps v1.0.0 h1:i6ampVEEF4wQFF+bkYfwYgY+F/uYJDktmvLPf7qIgjc= +github.com/mitchellh/go-ps v1.0.0/go.mod h1:J4lOc8z8yJs6vUwklHw2XEIiT4z4C40KtWVN3nvg8Pg= +github.com/playwright-community/playwright-go v0.4501.1 h1:kz8SIfR6nEI8blk77nTVD0K5/i37QP5rY/o8a1fG+4c= +github.com/playwright-community/playwright-go v0.4501.1/go.mod h1:bpArn5TqNzmP0jroCgw4poSOG9gSeQg490iLqWAaa7w= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= +github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod 
h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 h1:vr/HnozRka3pE4EsMEg1lgkXJkTFJCVUX+S/ZT6wYzM= +golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842/go.mod h1:XtvwrStGgqGPLc4cjQfWqZHG1YFdYs6swckp8vpsjnc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys= +golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors 
v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/config/config.go b/internal/config/config.go new file mode 100644 index 0000000..7ecc49d --- /dev/null +++ b/internal/config/config.go @@ -0,0 +1,45 @@ +package config + +import ( + "fmt" + "os" + + "gopkg.in/yaml.v2" +) + +type Config struct { + FileTypes []string `yaml:"file_types"` + Ignore []string `yaml:"ignore"` + CodeGenerated []string `yaml:"code_generated"` + Scrape ScrapeConfig `yaml:"scrape"` +} + +type ScrapeConfig struct { + URL string `yaml:"url"` + CSSLocator string `yaml:"css_locator"` +} + +func Load(configPath string) (*Config, error) { + data, err := os.ReadFile(configPath) + if err != nil { + return nil, fmt.Errorf("error reading config file: %v", err) + } + + var config Config + err = yaml.Unmarshal(data, &config) + if err != nil { + return nil, fmt.Errorf("error parsing config file: %v", err) + } + + return &config, nil +} + +func DefaultConfigPath() string { + return "rollup.yml" +} + +func FileExists(filename string) bool { + _, err := os.Stat(filename) + return err == nil +} + diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go new file mode 100644 index 0000000..1f0e7c2 --- /dev/null +++ 
b/internal/scraper/scraper.go @@ -0,0 +1,209 @@ +package scraper + +import ( + "fmt" + "log" + "math/rand" + "strings" + "time" + + "github.com/PuerkitoBio/goquery" + "github.com/playwright-community/playwright-go" + "github.com/russross/blackfriday/v2" +) + +var ( + pw *playwright.Playwright + browser playwright.Browser +) + +// Config holds the scraper configuration +type Config struct { + CSSLocator string +} + +// InitPlaywright initializes Playwright and launches the browser +func InitPlaywright() error { + log.Println("Initializing Playwright") + var err error + pw, err = playwright.Run() + if err != nil { + return fmt.Errorf("could not start Playwright: %v", err) + } + + userAgent := "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + + browser, err = pw.Chromium.Launch(playwright.BrowserTypeLaunchOptions{ + Args: []string{fmt.Sprintf("--user-agent=%s", userAgent)}, + }) + if err != nil { + return fmt.Errorf("could not launch browser: %v", err) + } + + log.Println("Playwright initialized successfully") + return nil +} + +// ClosePlaywright closes the browser and stops Playwright +func ClosePlaywright() { + if browser != nil { + browser.Close() + } + if pw != nil { + pw.Stop() + } +} + +// FetchWebpageContent retrieves the content of a webpage using Playwright +func FetchWebpageContent(urlStr string) (string, error) { + log.Printf("Fetching webpage content for URL: %s\n", urlStr) + + page, err := browser.NewPage() + if err != nil { + log.Printf("Error creating new page: %v\n", err) + return "", fmt.Errorf("could not create page: %v", err) + } + defer page.Close() + + err = page.EmulateMedia(playwright.PageEmulateMediaOptions{ + Media: playwright.MediaPrint, + }) + if err != nil { + log.Printf("Error emulating print media: %v\n", err) + return "", fmt.Errorf("could not emulate print media: %v", err) + } + + time.Sleep(time.Duration(rand.Intn(2000)+1000) * time.Millisecond) + + 
log.Printf("Navigating to URL: %s\n", urlStr) + if _, err = page.Goto(urlStr, playwright.PageGotoOptions{ + WaitUntil: playwright.WaitUntilStateNetworkidle, + }); err != nil { + log.Printf("Error navigating to page: %v\n", err) + return "", fmt.Errorf("could not go to page: %v", err) + } + + log.Println("Waiting for page load state") + err = page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{ + State: playwright.LoadStateNetworkidle, + }) + if err != nil { + log.Printf("Error waiting for page load: %v\n", err) + return "", fmt.Errorf("error waiting for page load: %v", err) + } + + log.Println("Scrolling page") + err = scrollPage(page) + if err != nil { + log.Printf("Error scrolling page: %v\n", err) + return "", fmt.Errorf("error scrolling page: %v", err) + } + + log.Println("Waiting for body element") + _, err = page.WaitForSelector("body", playwright.PageWaitForSelectorOptions{ + State: playwright.WaitForSelectorStateVisible, + }) + if err != nil { + log.Printf("Error waiting for body: %v\n", err) + return "", fmt.Errorf("error waiting for body: %v", err) + } + + log.Println("Getting page content") + content, err := page.Content() + if err != nil { + log.Printf("Error getting page content: %v\n", err) + return "", fmt.Errorf("could not get page content: %v", err) + } + + log.Printf("Successfully fetched webpage content (length: %d)\n", len(content)) + return content, nil +} + +// ProcessHTMLContent converts HTML content to Markdown +func ProcessHTMLContent(htmlContent string, config Config) (string, error) { + log.Printf("Processing HTML content (length: %d)\n", len(htmlContent)) + doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent)) + if err != nil { + log.Printf("Error parsing HTML: %v\n", err) + return "", fmt.Errorf("error parsing HTML: %v", err) + } + + var content string + if config.CSSLocator != "" { + log.Printf("Using CSS locator: %s\n", config.CSSLocator) + content, err = doc.Find(config.CSSLocator).Html() + if err != nil 
{ + log.Printf("Error extracting content with CSS locator: %v\n", err) + return "", fmt.Errorf("error extracting content with CSS locator: %v", err) + } + } else { + log.Println("No CSS locator provided, processing entire body") + content, err = doc.Find("body").Html() + if err != nil { + log.Printf("Error extracting body content: %v\n", err) + return "", fmt.Errorf("error extracting body content: %v", err) + } + } + + markdown := convertToMarkdown(content) + log.Printf("Converted HTML to Markdown (length: %d)\n", len(markdown)) + return markdown, nil +} + +func convertToMarkdown(html string) string { + // Use a simple HTML-to-Markdown conversion + markdown := blackfriday.Run([]byte(html), + blackfriday.WithExtensions(blackfriday.CommonExtensions|blackfriday.HardLineBreak)) + return string(markdown) +} + +func scrollPage(page playwright.Page) error { + log.Println("Starting page scroll") + script := ` + () => { + window.scrollTo(0, document.body.scrollHeight); + return document.body.scrollHeight; + } + ` + + previousHeight := 0 + for i := 0; i < 250; i++ { + height, err := page.Evaluate(script) + if err != nil { + log.Printf("Error scrolling (iteration %d): %v\n", i+1, err) + return fmt.Errorf("error scrolling: %v", err) + } + + var currentHeight int + switch v := height.(type) { + case int: + currentHeight = v + case float64: + currentHeight = int(v) + default: + log.Printf("Unexpected height type: %T\n", height) + return fmt.Errorf("unexpected height type: %T", height) + } + + log.Printf("Scroll iteration %d: height = %d\n", i+1, currentHeight) + + if currentHeight == previousHeight { + log.Println("Reached bottom of the page") + break + } + + previousHeight = currentHeight + + page.WaitForTimeout(500) + } + + log.Println("Scrolling back to top") + _, err := page.Evaluate(`() => { window.scrollTo(0, 0); }`) + if err != nil { + log.Printf("Error scrolling back to top: %v\n", err) + return fmt.Errorf("error scrolling back to top: %v", err) + } + + log.Println("Page 
scroll completed") + return nil +} diff --git a/main.go b/main.go index 3519178..ec2166c 100644 --- a/main.go +++ b/main.go @@ -2,12 +2,30 @@ package main import ( "fmt" + "log" "os" "github.com/tnypxl/rollup/cmd" + "github.com/tnypxl/rollup/internal/config" + "github.com/tnypxl/rollup/internal/scraper" ) +var cfg *config.Config + func main() { + configPath := config.DefaultConfigPath() + var err error + cfg, err = config.Load(configPath) + if err != nil { + log.Fatalf("Failed to load configuration: %v", err) + } + + err = scraper.InitPlaywright() + if err != nil { + log.Fatalf("Failed to initialize Playwright: %v", err) + } + defer scraper.ClosePlaywright() + if err := cmd.Execute(); err != nil { fmt.Println(err) os.Exit(1)