Adds a configuration layer that reads rollup.yml, which may be preferred over CLI flags.

This commit is contained in:
Arik Jones
2024-09-05 23:41:39 -05:00
parent f376f186c2
commit 0163c4e504
7 changed files with 527 additions and 153 deletions

3
.gitignore vendored
View File

@@ -19,3 +19,6 @@ go.work
# Ignore rollup output files
*rollup*.md
# Ignore rollup config file
rollup.yml

View File

@@ -1,154 +1,156 @@
package cmd
import (
"fmt"
"net/url"
"os"
"strings"
"time"
// TODO: Implement web scraping (DO NOT EDIT)
"github.com/JohannesKaufmann/html-to-markdown"
"github.com/spf13/cobra"
"github.com/tnypxl/rollup/internal/config"
"github.com/tnypxl/rollup/internal/scraper"
)
// Flag values for the web command; init binds them to --urls and --output.
var (
// urls lists the pages to scrape. runWeb falls back to scrape.url from the
// configuration when no URL was given on the command line.
urls []string
// outputFile is the Markdown file to write. runWeb generates a name with
// generateDefaultFilename when it is empty.
outputFile string
)
// webCmd is the "web" subcommand. Its behaviour is implemented by runWeb and
// it is attached to rootCmd in init.
var webCmd = &cobra.Command{
Use: "web",
Short: "Scrape main content from webpages and convert to Markdown",
Long: `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`,
RunE: runWeb,
}
// init attaches the web command to the root command and declares its flags.
func init() {
	rootCmd.AddCommand(webCmd)

	flags := webCmd.Flags()
	flags.StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
	// The default name is produced by generateDefaultFilename, which derives it
	// from the scraped hostnames rather than using a fixed "rollup-web" prefix.
	flags.StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: <hostname>-<timestamp>.md)")
}
// runWeb implements the "web" command.
//
// It loads rollup.yml, resolves the list of URLs (command-line flag first,
// then scrape.url from the configuration), picks an output filename, and
// writes the Markdown extracted from every URL to that file, separating
// pages with a horizontal rule.
//
// It returns an error when the configuration cannot be loaded, when no URL
// is available, or when fetching, converting, writing or closing fails.
func runWeb(cmd *cobra.Command, args []string) (err error) {
	const configPath = "rollup.yml"

	// config.Load returns a freshly formatted error, so os.IsNotExist cannot
	// recognise a missing file from its result. Check for the file up front
	// so the friendlier "not found" message stays reachable.
	if !config.FileExists(configPath) {
		return fmt.Errorf("rollup.yml file not found. Please create a configuration file or provide command-line arguments")
	}

	cfg, err = config.Load(configPath)
	if err != nil {
		return fmt.Errorf("error loading configuration: %w", err)
	}

	// Command-line URLs win; the configured URL is only a fallback.
	if len(urls) == 0 && cfg.Scrape.URL != "" {
		urls = []string{cfg.Scrape.URL}
	}
	if len(urls) == 0 {
		return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.url' in the rollup.yml file")
	}

	if outputFile == "" {
		outputFile = generateDefaultFilename(urls)
	}

	file, createErr := os.Create(outputFile)
	if createErr != nil {
		return fmt.Errorf("error creating output file: %w", createErr)
	}
	// A failed Close on a written file can mean lost data, so surface it
	// unless an earlier error is already being returned.
	defer func() {
		if closeErr := file.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("error closing output file: %w", closeErr)
		}
	}()

	for i, u := range urls {
		extractedContent, extractErr := extractAndConvertContent(u)
		if extractErr != nil {
			return fmt.Errorf("error extracting and converting content from %s: %w", u, extractErr)
		}

		if i > 0 {
			if _, writeErr := file.WriteString("\n\n---\n\n"); writeErr != nil {
				return fmt.Errorf("error writing separator to file: %w", writeErr)
			}
		}

		if _, writeErr := file.WriteString(extractedContent); writeErr != nil {
			return fmt.Errorf("error writing content to file: %w", writeErr)
		}
	}

	fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(urls), outputFile)
	return nil
}
// generateDefaultFilename builds an output filename of the form
// "<base>-<timestamp>.md" from the hostnames of urls.
//
// The base is the single hostname, "<a>-and-<b>" for two hostnames, or
// "<a>-and-<n>-others" for more. URLs that fail to parse or that carry no
// hostname (for example a scheme-less "example.com") are ignored; when no
// hostname is left the base is "web-content". Common TLD suffixes are
// dropped, remaining dots become dashes, and the base is capped at 50 bytes.
func generateDefaultFilename(urls []string) string {
	const maxBaseLen = 50

	hostnames := make([]string, 0, len(urls))
	for _, u := range urls {
		parsedURL, err := url.Parse(u)
		if err != nil {
			continue
		}
		// A scheme-less input parses without error but has an empty host;
		// keeping it would yield names such as "-<timestamp>.md".
		if host := parsedURL.Hostname(); host != "" {
			hostnames = append(hostnames, host)
		}
	}

	var baseFilename string
	switch len(hostnames) {
	case 0:
		baseFilename = "web-content"
	case 1:
		baseFilename = hostnames[0]
	case 2:
		baseFilename = fmt.Sprintf("%s-and-%s", hostnames[0], hostnames[1])
	default:
		baseFilename = fmt.Sprintf("%s-and-%d-others", hostnames[0], len(hostnames)-1)
	}

	baseFilename = strings.NewReplacer(
		".com", "",
		".org", "",
		".net", "",
		".edu", "",
		".", "-",
	).Replace(baseFilename)

	if len(baseFilename) > maxBaseLen {
		// Cut at the last rune boundary at or before the limit so a
		// multi-byte character is never split in half.
		cut := 0
		for i := range baseFilename {
			if i > maxBaseLen {
				break
			}
			cut = i
		}
		baseFilename = baseFilename[:cut]
	}

	timestamp := time.Now().Format("20060102-150405")
	return fmt.Sprintf("%s-%s.md", baseFilename, timestamp)
}
// extractAndConvertContent fetches the page at urlStr, converts its HTML to
// Markdown, and returns the Markdown prefixed with a "# Content from <url>"
// heading and followed by a blank line.
//
// It returns an error when urlStr cannot be parsed, when the page cannot be
// fetched, or when the HTML-to-Markdown conversion fails.
func extractAndConvertContent(urlStr string) (string, error) {
	// Validate the URL before starting the comparatively expensive browser
	// fetch, so a malformed URL fails immediately.
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		return "", fmt.Errorf("error parsing URL: %w", err)
	}

	content, err := scraper.FetchWebpageContent(urlStr)
	if err != nil {
		return "", fmt.Errorf("error fetching webpage content: %w", err)
	}

	// Use the CSS locator from the config
	cssLocator := cfg.Scrape.CSSLocator
	if cssLocator != "" {
		// TODO: Implement content extraction with CSS selector
		// content, err = scraper.ExtractContentWithCSS(content, cssLocator)
		// if err != nil {
		// 	return "", fmt.Errorf("error extracting content with CSS selector: %v", err)
		// }
	}

	// Convert the fetched HTML to Markdown.
	converter := md.NewConverter("", true, nil)
	markdown, err := converter.ConvertString(content)
	if err != nil {
		return "", fmt.Errorf("error converting HTML to Markdown: %w", err)
	}

	header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String())
	return header + markdown + "\n\n", nil
}
// import (
// "fmt"
// "net/url"
// "os"
// "strings"
// "time"
//
// "github.com/JohannesKaufmann/html-to-markdown"
// "github.com/spf13/cobra"
// "github.com/tnypxl/rollup/internal/config"
// "github.com/tnypxl/rollup/internal/scraper"
// )
//
// var (
// urls []string
// outputFile string
// )
//
// var webCmd = &cobra.Command{
// Use: "web",
// Short: "Scrape main content from webpages and convert to Markdown",
// Long: `Scrape the main content from one or more webpages, ignoring navigational elements, ads, and other UI aspects. Convert the content to a well-structured Markdown file.`,
// RunE: runWeb,
// }
//
// func init() {
// rootCmd.AddCommand(webCmd)
// webCmd.Flags().StringSliceVarP(&urls, "urls", "u", []string{}, "URLs of the webpages to scrape (comma-separated)")
// webCmd.Flags().StringVarP(&outputFile, "output", "o", "", "Output Markdown file (default: rollup-web-<timestamp>.md)")
// }
//
// func runWeb(cmd *cobra.Command, args []string) error {
// var err error
// cfg, err = config.Load("rollup.yml")
// if err != nil {
// if os.IsNotExist(err) {
// return fmt.Errorf("rollup.yml file not found. Please create a configuration file or provide command-line arguments")
// }
// return fmt.Errorf("error loading configuration: %v", err)
// }
//
// // Use config if available, otherwise use command-line flags
// if len(urls) == 0 && cfg.Scrape.URL != "" {
// urls = []string{cfg.Scrape.URL}
// }
//
// if len(urls) == 0 {
// return fmt.Errorf("no URLs provided. Use --urls flag with comma-separated URLs or set 'scrape.url' in the rollup.yml file")
// }
//
// if outputFile == "" {
// outputFile = generateDefaultFilename(urls)
// }
//
// file, err := os.Create(outputFile)
// if err != nil {
// return fmt.Errorf("error creating output file: %v", err)
// }
// defer file.Close()
//
// for i, u := range urls {
// extractedContent, err := extractAndConvertContent(u)
// if err != nil {
// return fmt.Errorf("error extracting and converting content from %s: %v", u, err)
// }
//
// if i > 0 {
// _, err = file.WriteString("\n\n---\n\n")
// if err != nil {
// return fmt.Errorf("error writing separator to file: %v", err)
// }
// }
//
// _, err = file.WriteString(extractedContent)
// if err != nil {
// return fmt.Errorf("error writing content to file: %v", err)
// }
// }
//
// fmt.Printf("Content has been extracted from %d URL(s) and saved to %s\n", len(urls), outputFile)
// return nil
// }
//
// func generateDefaultFilename(urls []string) string {
// var hostnames []string
// for _, u := range urls {
// parsedURL, err := url.Parse(u)
// if err == nil {
// hostnames = append(hostnames, parsedURL.Hostname())
// }
// }
//
// var baseFilename string
// if len(hostnames) == 1 {
// baseFilename = hostnames[0]
// } else if len(hostnames) == 2 {
// baseFilename = fmt.Sprintf("%s-and-%s", hostnames[0], hostnames[1])
// } else if len(hostnames) > 2 {
// baseFilename = fmt.Sprintf("%s-and-%d-others", hostnames[0], len(hostnames)-1)
// } else {
// baseFilename = "web-content"
// }
//
// baseFilename = strings.NewReplacer(
// ".com", "",
// ".org", "",
// ".net", "",
// ".edu", "",
// ".", "-",
// ).Replace(baseFilename)
//
// if len(baseFilename) > 50 {
// baseFilename = baseFilename[:50]
// }
//
// timestamp := time.Now().Format("20060102-150405")
// return fmt.Sprintf("%s-%s.md", baseFilename, timestamp)
// }
//
// func extractAndConvertContent(urlStr string) (string, error) {
// content, err := scraper.FetchWebpageContent(urlStr)
// if err != nil {
// return "", fmt.Errorf("error fetching webpage content: %v", err)
// }
//
// // Use the CSS locator from the config
// cssLocator := cfg.Scrape.CSSLocator
// if cssLocator != "" {
// // TODO: Implement content extraction with CSS selector
// // content, err = scraper.ExtractContentWithCSS(content, cssLocator)
// // if err != nil {
// // return "", fmt.Errorf("error extracting content with CSS selector: %v", err)
// // }
// }
//
// // Create a new converter
// converter := md.NewConverter("", true, nil)
//
// // Convert HTML to Markdown
// markdown, err := converter.ConvertString(content)
// if err != nil {
// return "", fmt.Errorf("error converting HTML to Markdown: %v", err)
// }
//
// parsedURL, err := url.Parse(urlStr)
// if err != nil {
// return "", fmt.Errorf("error parsing URL: %v", err)
// }
// header := fmt.Sprintf("# Content from %s\n\n", parsedURL.String())
//
// return header + markdown + "\n\n", nil
// }

16
go.mod
View File

@@ -2,12 +2,24 @@ module github.com/tnypxl/rollup
go 1.23
require github.com/spf13/cobra v1.8.1
require (
github.com/anthropics/anthropic-sdk-go v0.5.0
github.com/spf13/cobra v1.8.1
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/deckarep/golang-set/v2 v2.6.0 // indirect
github.com/go-jose/go-jose/v3 v3.0.3 // indirect
github.com/go-stack/stack v1.8.1 // indirect
github.com/kr/text v0.1.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 // indirect
golang.org/x/net v0.27.0 // indirect
)
require (
github.com/PuerkitoBio/goquery v1.9.2
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/playwright-community/playwright-go v0.4501.1
github.com/russross/blackfriday/v2 v2.1.0
github.com/spf13/pflag v1.0.5 // indirect
gopkg.in/yaml.v2 v2.4.0
)

85
go.sum
View File

@@ -1,10 +1,95 @@
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/deckarep/golang-set/v2 v2.6.0 h1:XfcQbWM1LlMB8BsJ8N9vW5ehnnPVIw0je80NsVHagjM=
github.com/deckarep/golang-set/v2 v2.6.0/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4=
github.com/go-jose/go-jose/v3 v3.0.3 h1:fFKWeig/irsp7XD2zBxvnmA/XaRWp5V3CBsZXJF7G7k=
github.com/go-jose/go-jose/v3 v3.0.3/go.mod h1:5b+7YgP7ZICgJDBdfjZaIt+H/9L9T/YQrVfLAMboGkQ=
github.com/go-stack/stack v1.8.1 h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw=
github.com/go-stack/stack v1.8.1/go.mod h1:dcoOX6HbPZSZptuspn9bctJ+N/CnF5gGygcUP3XYfe4=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/mitchellh/go-ps v1.0.0 h1:i6ampVEEF4wQFF+bkYfwYgY+F/uYJDktmvLPf7qIgjc=
github.com/mitchellh/go-ps v1.0.0/go.mod h1:J4lOc8z8yJs6vUwklHw2XEIiT4z4C40KtWVN3nvg8Pg=
github.com/playwright-community/playwright-go v0.4501.1 h1:kz8SIfR6nEI8blk77nTVD0K5/i37QP5rY/o8a1fG+4c=
github.com/playwright-community/playwright-go v0.4501.1/go.mod h1:bpArn5TqNzmP0jroCgw4poSOG9gSeQg490iLqWAaa7w=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M=
github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 h1:vr/HnozRka3pE4EsMEg1lgkXJkTFJCVUX+S/ZT6wYzM=
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842/go.mod h1:XtvwrStGgqGPLc4cjQfWqZHG1YFdYs6swckp8vpsjnc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys=
golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

45
internal/config/config.go Normal file
View File

@@ -0,0 +1,45 @@
package config
import (
"fmt"
"os"
"gopkg.in/yaml.v2"
)
// Config is the top-level structure that Load decodes the YAML configuration
// file into. Each field is populated from the YAML key named in its tag.
type Config struct {
// FileTypes is read from the "file_types" key.
FileTypes []string `yaml:"file_types"`
// Ignore is read from the "ignore" key.
Ignore []string `yaml:"ignore"`
// CodeGenerated is read from the "code_generated" key.
CodeGenerated []string `yaml:"code_generated"`
// Scrape holds the settings found under the "scrape" key.
Scrape ScrapeConfig `yaml:"scrape"`
}
// ScrapeConfig is the "scrape" section of the configuration file.
type ScrapeConfig struct {
// URL is read from the "url" key.
URL string `yaml:"url"`
// CSSLocator is read from the "css_locator" key.
CSSLocator string `yaml:"css_locator"`
}
// Load reads the YAML file at configPath and decodes it into a Config.
//
// It returns an error when the file cannot be read or when its contents are
// not valid YAML for Config. The underlying error is wrapped, so callers can
// inspect it with errors.Is / errors.As (for example against os.ErrNotExist).
func Load(configPath string) (*Config, error) {
	data, err := os.ReadFile(configPath)
	if err != nil {
		return nil, fmt.Errorf("error reading config file: %w", err)
	}

	// Named cfg rather than config to avoid shadowing the package name.
	var cfg Config
	if err := yaml.Unmarshal(data, &cfg); err != nil {
		return nil, fmt.Errorf("error parsing config file: %w", err)
	}
	return &cfg, nil
}
// DefaultConfigPath returns the path of the configuration file used when no
// other path is specified.
func DefaultConfigPath() string {
	const defaultPath = "rollup.yml"
	return defaultPath
}
// FileExists reports whether os.Stat succeeds for filename. Any Stat error,
// not only "does not exist", yields false.
func FileExists(filename string) bool {
	if _, statErr := os.Stat(filename); statErr != nil {
		return false
	}
	return true
}

209
internal/scraper/scraper.go Normal file
View File

@@ -0,0 +1,209 @@
package scraper
import (
"fmt"
"log"
"math/rand"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/playwright-community/playwright-go"
"github.com/russross/blackfriday/v2"
)
// Package-level Playwright handles, assigned by InitPlaywright and released
// by ClosePlaywright.
var (
// pw is the running Playwright driver.
pw *playwright.Playwright
// browser is the Chromium instance FetchWebpageContent opens pages on.
browser playwright.Browser
)
// Config holds the scraper configuration
type Config struct {
// CSSLocator selects the element ProcessHTMLContent extracts; when it is
// empty the whole <body> is processed.
CSSLocator string
}
// InitPlaywright starts the Playwright driver and launches a Chromium
// browser with a fixed desktop user agent, storing both in the package-level
// pw and browser handles.
//
// It returns an error when the driver cannot be started or the browser
// cannot be launched. On a failed launch the driver is stopped again so no
// Playwright process is left behind, and the package handles are left
// untouched.
func InitPlaywright() error {
	log.Println("Initializing Playwright")

	driver, err := playwright.Run()
	if err != nil {
		return fmt.Errorf("could not start Playwright: %w", err)
	}

	userAgent := "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
	launched, err := driver.Chromium.Launch(playwright.BrowserTypeLaunchOptions{
		Args: []string{fmt.Sprintf("--user-agent=%s", userAgent)},
	})
	if err != nil {
		// The driver is already running; stop it so a failed
		// initialization does not leak it.
		if stopErr := driver.Stop(); stopErr != nil {
			log.Printf("Error stopping Playwright after failed launch: %v\n", stopErr)
		}
		return fmt.Errorf("could not launch browser: %w", err)
	}

	pw, browser = driver, launched
	log.Println("Playwright initialized successfully")
	return nil
}
// ClosePlaywright closes the browser and stops the Playwright driver.
//
// Shutdown is best-effort: failures are logged rather than returned. Both
// handles are reset to nil afterwards, so calling ClosePlaywright again is a
// no-op.
func ClosePlaywright() {
	if browser != nil {
		if err := browser.Close(); err != nil {
			log.Printf("Error closing browser: %v\n", err)
		}
		browser = nil
	}
	if pw != nil {
		if err := pw.Stop(); err != nil {
			log.Printf("Error stopping Playwright: %v\n", err)
		}
		pw = nil
	}
}
// FetchWebpageContent retrieves the rendered HTML of the page at urlStr
// using the shared Playwright browser.
//
// The page is opened with print-media emulation, loaded until the network is
// idle, scrolled to the bottom (and back to the top) by scrollPage, and then
// serialized once the body element is visible. A random 1-3 second pause
// precedes navigation.
//
// It returns an error when the browser has not been initialized with
// InitPlaywright or when any of the Playwright steps fails.
func FetchWebpageContent(urlStr string) (string, error) {
	log.Printf("Fetching webpage content for URL: %s\n", urlStr)

	// Calling a method on the nil browser interface would panic; report a
	// normal error instead.
	if browser == nil {
		return "", fmt.Errorf("could not create page: browser is not initialized")
	}

	page, err := browser.NewPage()
	if err != nil {
		log.Printf("Error creating new page: %v\n", err)
		return "", fmt.Errorf("could not create page: %w", err)
	}
	defer func() {
		if closeErr := page.Close(); closeErr != nil {
			log.Printf("Error closing page: %v\n", closeErr)
		}
	}()

	err = page.EmulateMedia(playwright.PageEmulateMediaOptions{
		Media: playwright.MediaPrint,
	})
	if err != nil {
		log.Printf("Error emulating print media: %v\n", err)
		return "", fmt.Errorf("could not emulate print media: %w", err)
	}

	// Random pause of 1000-2999 ms before navigating.
	time.Sleep(time.Duration(rand.Intn(2000)+1000) * time.Millisecond)

	log.Printf("Navigating to URL: %s\n", urlStr)
	if _, err = page.Goto(urlStr, playwright.PageGotoOptions{
		WaitUntil: playwright.WaitUntilStateNetworkidle,
	}); err != nil {
		log.Printf("Error navigating to page: %v\n", err)
		return "", fmt.Errorf("could not go to page: %w", err)
	}

	log.Println("Waiting for page load state")
	err = page.WaitForLoadState(playwright.PageWaitForLoadStateOptions{
		State: playwright.LoadStateNetworkidle,
	})
	if err != nil {
		log.Printf("Error waiting for page load: %v\n", err)
		return "", fmt.Errorf("error waiting for page load: %w", err)
	}

	log.Println("Scrolling page")
	if err = scrollPage(page); err != nil {
		log.Printf("Error scrolling page: %v\n", err)
		return "", fmt.Errorf("error scrolling page: %w", err)
	}

	log.Println("Waiting for body element")
	_, err = page.WaitForSelector("body", playwright.PageWaitForSelectorOptions{
		State: playwright.WaitForSelectorStateVisible,
	})
	if err != nil {
		log.Printf("Error waiting for body: %v\n", err)
		return "", fmt.Errorf("error waiting for body: %w", err)
	}

	log.Println("Getting page content")
	content, err := page.Content()
	if err != nil {
		log.Printf("Error getting page content: %v\n", err)
		return "", fmt.Errorf("could not get page content: %w", err)
	}

	log.Printf("Successfully fetched webpage content (length: %d)\n", len(content))
	return content, nil
}
// ProcessHTMLContent parses htmlContent, takes the inner HTML of the element
// selected by config.CSSLocator (or of <body> when no locator is set), and
// passes that fragment to convertToMarkdown.
//
// It returns an error when the HTML cannot be parsed or the fragment cannot
// be serialized.
func ProcessHTMLContent(htmlContent string, config Config) (string, error) {
	log.Printf("Processing HTML content (length: %d)\n", len(htmlContent))

	document, parseErr := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
	if parseErr != nil {
		log.Printf("Error parsing HTML: %v\n", parseErr)
		return "", fmt.Errorf("error parsing HTML: %v", parseErr)
	}

	var fragment string
	switch locator := config.CSSLocator; locator {
	case "":
		log.Println("No CSS locator provided, processing entire body")
		body, bodyErr := document.Find("body").Html()
		if bodyErr != nil {
			log.Printf("Error extracting body content: %v\n", bodyErr)
			return "", fmt.Errorf("error extracting body content: %v", bodyErr)
		}
		fragment = body
	default:
		log.Printf("Using CSS locator: %s\n", locator)
		selected, selectErr := document.Find(locator).Html()
		if selectErr != nil {
			log.Printf("Error extracting content with CSS locator: %v\n", selectErr)
			return "", fmt.Errorf("error extracting content with CSS locator: %v", selectErr)
		}
		fragment = selected
	}

	rendered := convertToMarkdown(fragment)
	log.Printf("Converted HTML to Markdown (length: %d)\n", len(rendered))
	return rendered, nil
}
// convertToMarkdown runs html through blackfriday with the common extensions
// and hard line breaks enabled, and returns the output as a string.
//
// NOTE(review): blackfriday is documented as a Markdown processor that
// renders Markdown to HTML, so this call likely does not convert HTML into
// Markdown as the function name suggests. Confirm, and switch to an
// HTML-to-Markdown converter if so.
func convertToMarkdown(html string) string {
// Intended as a simple HTML-to-Markdown conversion (see review note in the
// function comment).
markdown := blackfriday.Run([]byte(html),
blackfriday.WithExtensions(blackfriday.CommonExtensions|blackfriday.HardLineBreak))
return string(markdown)
}
// scrollPage repeatedly scrolls page to the bottom until the document height
// stops changing (at most 250 rounds, pausing 500 ms between rounds), then
// scrolls back to the top.
//
// It returns an error when a script evaluation fails or when the height
// comes back as neither an int nor a float64.
func scrollPage(page playwright.Page) error {
	log.Println("Starting page scroll")

	const maxRounds = 250
	scrollToBottom := `
() => {
window.scrollTo(0, document.body.scrollHeight);
return document.body.scrollHeight;
}
`

	lastHeight := 0
	for round := 1; round <= maxRounds; round++ {
		height, evalErr := page.Evaluate(scrollToBottom)
		if evalErr != nil {
			log.Printf("Error scrolling (iteration %d): %v\n", round, evalErr)
			return fmt.Errorf("error scrolling: %v", evalErr)
		}

		// The evaluated height may arrive as an int or a float64.
		var measured int
		if asInt, isInt := height.(int); isInt {
			measured = asInt
		} else if asFloat, isFloat := height.(float64); isFloat {
			measured = int(asFloat)
		} else {
			log.Printf("Unexpected height type: %T\n", height)
			return fmt.Errorf("unexpected height type: %T", height)
		}

		log.Printf("Scroll iteration %d: height = %d\n", round, measured)

		// An unchanged height means nothing more was loaded by scrolling.
		if measured == lastHeight {
			log.Println("Reached bottom of the page")
			break
		}
		lastHeight = measured

		page.WaitForTimeout(500)
	}

	log.Println("Scrolling back to top")
	if _, topErr := page.Evaluate(`() => { window.scrollTo(0, 0); }`); topErr != nil {
		log.Printf("Error scrolling back to top: %v\n", topErr)
		return fmt.Errorf("error scrolling back to top: %v", topErr)
	}

	log.Println("Page scroll completed")
	return nil
}

18
main.go
View File

@@ -2,12 +2,30 @@ package main
import (
"fmt"
"log"
"os"
"github.com/tnypxl/rollup/cmd"
"github.com/tnypxl/rollup/internal/config"
"github.com/tnypxl/rollup/internal/scraper"
)
// cfg holds the configuration that main loads from the default config path
// at startup.
var cfg *config.Config
func main() {
configPath := config.DefaultConfigPath()
var err error
cfg, err = config.Load(configPath)
if err != nil {
log.Fatalf("Failed to load configuration: %v", err)
}
err = scraper.InitPlaywright()
if err != nil {
log.Fatalf("Failed to initialize Playwright: %v", err)
}
defer scraper.ClosePlaywright()
if err := cmd.Execute(); err != nil {
fmt.Println(err)
os.Exit(1)