Skip to content

Commit

Permalink
Merge branch 'release/v0.3.0-alpha'
Browse files Browse the repository at this point in the history
  • Loading branch information
andreaskoch committed May 2, 2020
2 parents 7f4882e + a1625fa commit 2c56094
Show file tree
Hide file tree
Showing 30 changed files with 223 additions and 858 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/.idea
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).

## [Unreleased]

### Added
- Add "Save downloaded data to disk" to the roadmap (feature request #1)

## [v0.2.0-alpha] - 2017-02-07

### Fixed
Expand Down
12 changes: 10 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,20 @@ gargantua crawl --url https://www.sitemaps.org/sitemap.xml --workers 5

see also: [A short introduction video of gargantua on YouTube](https://www.youtube.com/watch?v=TSCMvUvc0qo)

### Customize the user-agent

You can specify a customized user agent using the `--user-agent` argument:

```bash
gargantua crawl --url https://www.sitemaps.org/sitemap.xml --workers 5 --user-agent "gargantua bot / iPhone"
```

## Download

You can download binaries for Linux, macOS and Windows from [github.com » andreaskoch » gargantua » releases](https://github.com/andreaskoch/gargantua/releases):

```bash
wget https://github.com/andreaskoch/gargantua/releases/download/v0.2.0-alpha/gargantua_linux_amd64
wget https://github.com/andreaskoch/gargantua/releases/download/v0.3.0-alpha/gargantua_linux_amd64
```

## Docker Image
Expand All @@ -52,10 +60,10 @@ docker run --rm andreaskoch/gargantua:latest \
## Roadmap

- Increase the number of workers at runtime
- Personalized user agent string
- Silent mode (only show statistics at the end)
- CSV mode (print CSV output to stdout)
- Web-UI
- Save downloaded data to disk

## License

Expand Down
9 changes: 5 additions & 4 deletions crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@ import (
// CrawlOptions holds the settings that are handed to crawl.
type CrawlOptions struct {
// NumberOfConcurrentRequests is populated by startCrawling from the
// value of the --workers command line flag.
NumberOfConcurrentRequests int
// Timeout is built by startCrawling from a whole number of seconds.
Timeout time.Duration
// UserAgent is the user-agent string that crawl forwards to executeWork
// for every URL it processes.
// NOTE(review): the initial getURLs call in crawl passes the literal
// "gargantua bot" rather than this value — confirm whether the sitemap
// request is meant to ignore the configured user agent.
UserAgent string
}

func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {

// read the XML sitemap as an initial source for URLs
urlsFromXMLSitemap, err := getURLs(xmlSitemapURL)
urlsFromXMLSitemap, err := getURLs(xmlSitemapURL, "gargantua bot")
if err != nil {
return err
}
Expand Down Expand Up @@ -59,7 +60,7 @@ func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
go func() {
workerID := <-workers
debugf("Using worker %d for URL %q", workerID, targetURL.String())
results <- executeWork(workerID, cap(workers), targetURL, urls)
results <- executeWork(workerID, cap(workers), targetURL, options.UserAgent, urls)
debugf("Worker %d finished processing URL %q", workerID, targetURL.String())
workers <- workerID
}()
Expand All @@ -85,8 +86,8 @@ func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
return

case result := <-results:
url := result.URL()
debugf("Received results for URL %q", url.String())
receivedUrl := result.URL()
debugf("Received results for URL %q", receivedUrl.String())
updateStatistics(result)
}
}
Expand Down
17 changes: 17 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
module github.com/andreaskoch/gargantua

go 1.14

require (
github.com/PuerkitoBio/goquery v1.0.2
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc // indirect
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf // indirect
github.com/andybalholm/cascadia v0.0.0-20161224141413-349dd0209470 // indirect
github.com/gizak/termui v2.2.1-0.20170117222342-991cd3d38091+incompatible
github.com/maruel/panicparse v0.0.0-20160720141634-ad661195ed0e // indirect
github.com/mattn/go-runewidth v0.0.2-0.20161012013512-737072b4e32b // indirect
github.com/mitchellh/go-wordwrap v0.0.0-20150314170334-ad45545899c7 // indirect
github.com/nsf/termbox-go v0.0.0-20161205194251-abe82ce5fb7a // indirect
golang.org/x/net v0.0.0-20161101191631-4bb47a1098b3 // indirect
gopkg.in/alecthomas/kingpin.v2 v2.2.3
)
22 changes: 22 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
github.com/PuerkitoBio/goquery v1.0.2 h1:6eVgli+CgrpInQgyW5Unj3aqfzqFk/ALcKm6m0w7hgA=
github.com/PuerkitoBio/goquery v1.0.2/go.mod h1:T9ezsOHcCrDCgA8aF1Cqr3sSYbO/xgdy8/R/XiIMAhA=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc h1:cAKDfWh5VpdgMhJosfJnn5/FoN2SRZ4p7fJNX58YPaU=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf h1:qet1QNfXsQxTZqLG4oE62mJzwPIB8+Tee4RNCL9ulrY=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/andybalholm/cascadia v0.0.0-20161224141413-349dd0209470 h1:4jHLmof+Hba81591gfH5xYA8QXzuvgksxwPNrmjR2BA=
github.com/andybalholm/cascadia v0.0.0-20161224141413-349dd0209470/go.mod h1:3I+3V7B6gTBYfdpYgIG2ymALS9H+5VDKUl3lHH7ToM4=
github.com/gizak/termui v2.2.1-0.20170117222342-991cd3d38091+incompatible h1:opetNB+OO9qymCnrSBGZPPKuQMMYBcyrzEYiOB+RrHM=
github.com/gizak/termui v2.2.1-0.20170117222342-991cd3d38091+incompatible/go.mod h1:PkJoWUt/zacQKysNfQtcw1RW+eK2SxkieVBtl+4ovLA=
github.com/maruel/panicparse v0.0.0-20160720141634-ad661195ed0e h1:e2z/lz9pvtRrEOgKWaLW2Dw02Nqd3/fqv0qWTQ8ByZE=
github.com/maruel/panicparse v0.0.0-20160720141634-ad661195ed0e/go.mod h1:nty42YY5QByNC5MM7q/nj938VbgPU7avs45z6NClpxI=
github.com/mattn/go-runewidth v0.0.2-0.20161012013512-737072b4e32b h1:zGKCShADxSzhD4RVcNFKuaedhqMYyWD54Fg9aV/BvXM=
github.com/mattn/go-runewidth v0.0.2-0.20161012013512-737072b4e32b/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
github.com/mitchellh/go-wordwrap v0.0.0-20150314170334-ad45545899c7 h1:DpOJ2HYzCv8LZP15IdmG+YdwD2luVPHITV96TkirNBM=
github.com/mitchellh/go-wordwrap v0.0.0-20150314170334-ad45545899c7/go.mod h1:ZXFpozHsX6DPmq2I0TCekCxypsnAUbP2oI0UX1GXzOo=
github.com/nsf/termbox-go v0.0.0-20161205194251-abe82ce5fb7a h1:JbDkPy70t0IWlnEvNb5TsmOOpKp/0UkPk2FMek2mOGM=
github.com/nsf/termbox-go v0.0.0-20161205194251-abe82ce5fb7a/go.mod h1:IuKpRQcYE1Tfu+oAQqaLisqDeXgjyyltCfsaoYN18NQ=
golang.org/x/net v0.0.0-20161101191631-4bb47a1098b3 h1:9FrZULpPblLeSMxFmRapLbJGYHjcvaCZYD+5rwKQqZA=
golang.org/x/net v0.0.0-20161101191631-4bb47a1098b3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
gopkg.in/alecthomas/kingpin.v2 v2.2.3 h1:/L3oK40poPRwke0Ipa6qqf8n+awu60Vl3DMe+3jLDt4=
gopkg.in/alecthomas/kingpin.v2 v2.2.3/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=
29 changes: 19 additions & 10 deletions http.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,18 @@ func (response *Response) IsHTML() bool {
return strings.HasPrefix(response.contentType, "text/html")
}

func readURL(url url.URL) (Response, error) {
func readURL(url url.URL, userAgent string) (Response, error) {
startTime := time.Now().UTC()
resp, fetchErr := http.Get(url.String())

req, requestErr := http.NewRequest("GET", url.String(), nil)
if requestErr != nil {
return Response{}, requestErr
}

req.Header.Set("User-Agent", userAgent)

client := &http.Client{}
resp, fetchErr := client.Do(req)
if fetchErr != nil {
return Response{}, fetchErr
}
Expand Down Expand Up @@ -78,16 +87,16 @@ func readURL(url url.URL) (Response, error) {
}, nil
}

func getURLs(xmlSitemapURL url.URL) ([]url.URL, error) {
func getURLs(xmlSitemapURL url.URL, userAgent string) ([]url.URL, error) {

var urls []url.URL

urlsFromIndex, indexError := getURLsFromSitemapIndex(xmlSitemapURL)
urlsFromIndex, indexError := getURLsFromSitemapIndex(xmlSitemapURL, userAgent)
if indexError == nil {
urls = urlsFromIndex
}

urlsFromSitemap, sitemapError := getURLsFromSitemap(xmlSitemapURL)
urlsFromSitemap, sitemapError := getURLsFromSitemap(xmlSitemapURL, userAgent)
if sitemapError == nil {
urls = append(urls, urlsFromSitemap...)
}
Expand All @@ -100,11 +109,11 @@ func getURLs(xmlSitemapURL url.URL) ([]url.URL, error) {

}

func getURLsFromSitemap(xmlSitemapURL url.URL) ([]url.URL, error) {
func getURLsFromSitemap(xmlSitemapURL url.URL, userAgent string) ([]url.URL, error) {

var urls []url.URL

sitemap, xmlSitemapError := getXMLSitemap(xmlSitemapURL)
sitemap, xmlSitemapError := getXMLSitemap(xmlSitemapURL, userAgent)
if xmlSitemapError != nil {
return nil, xmlSitemapError
}
Expand All @@ -122,11 +131,11 @@ func getURLsFromSitemap(xmlSitemapURL url.URL) ([]url.URL, error) {
return urls, nil
}

func getURLsFromSitemapIndex(xmlSitemapURL url.URL) ([]url.URL, error) {
func getURLsFromSitemapIndex(xmlSitemapURL url.URL, userAgent string) ([]url.URL, error) {

var urls []url.URL

sitemapIndex, sitemapIndexError := getSitemapIndex(xmlSitemapURL)
sitemapIndex, sitemapIndexError := getSitemapIndex(xmlSitemapURL, userAgent)
if sitemapIndexError != nil {
return nil, sitemapIndexError
}
Expand All @@ -138,7 +147,7 @@ func getURLsFromSitemapIndex(xmlSitemapURL url.URL) ([]url.URL, error) {
return nil, err
}

sitemapUrls, err := getURLsFromSitemap(*locationURL)
sitemapUrls, err := getURLsFromSitemap(*locationURL, userAgent)
if err != nil {
return nil, err
}
Expand Down
10 changes: 7 additions & 3 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ import (
)

const applicationName = "gargantua"
const applicationVersion = "v0.2.0-alpha"
const applicationVersion = "v0.3.0-alpha"

var defaultUserAgent = fmt.Sprintf("%s bot (https://github.com/andreaskoch/gargantua)", applicationName)

var (
app = kingpin.New(applicationName, fmt.Sprintf(`「 %s 」%s crawls all URLs of your website - starting with the links in your sitemap.xml
Expand All @@ -27,6 +29,7 @@ var (
crawlCommand = app.Command("crawl", "Crawls a given websites' XML sitemap")
crawlWebsiteURL = crawlCommand.Flag("url", "The URL to a websites' XML sitemap (e.g. https://www.sitemaps.org/sitemap.xml)").Required().Envar("GARGANTUA_URL").Short('u').String()
crawlWorkers = crawlCommand.Flag("workers", "The number of concurrent workers that crawl the site at the same time").Required().Envar("GARGANTUA_WORKERS").Short('w').Int()
userAgent = crawlCommand.Flag("user-agent", "The user agent that shall be used for all requests").Default(defaultUserAgent).Envar("GARGANTUA_USER_AGENT").Short('a').String()
)

func init() {
Expand All @@ -49,7 +52,7 @@ func handleCommandlineArgument(arguments []string) {
os.Exit(1)
}

err := startCrawling(*websiteURL, *crawlWorkers, *timeout, *verbose)
err := startCrawling(*websiteURL, *userAgent, *crawlWorkers, *timeout, *verbose)
if err != nil {
fmt.Fprintf(os.Stderr, "%s", err)
os.Exit(1)
Expand All @@ -60,7 +63,7 @@ func handleCommandlineArgument(arguments []string) {

}

func startCrawling(targetURL url.URL, concurrentRequests, timeoutInSeconds int, debugModeIsEnabled bool) error {
func startCrawling(targetURL url.URL, userAgent string, concurrentRequests, timeoutInSeconds int, debugModeIsEnabled bool) error {
stopTheCrawler := make(chan bool)
stopTheUI := make(chan bool)
crawlResult := make(chan error)
Expand All @@ -69,6 +72,7 @@ func startCrawling(targetURL url.URL, concurrentRequests, timeoutInSeconds int,
result := crawl(targetURL, CrawlOptions{
NumberOfConcurrentRequests: int(concurrentRequests),
Timeout: time.Second * time.Duration(timeoutInSeconds),
UserAgent: userAgent,
}, stopTheCrawler)

stopTheUI <- true
Expand Down
4 changes: 2 additions & 2 deletions sitemapindex.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ import (
"strings"
)

func getSitemapIndex(xmlSitemapURL url.URL) (SitemapIndex, error) {
response, readErr := readURL(xmlSitemapURL)
func getSitemapIndex(xmlSitemapURL url.URL, userAgent string) (SitemapIndex, error) {
response, readErr := readURL(xmlSitemapURL, userAgent)
if readErr != nil {
return SitemapIndex{}, readErr
}
Expand Down
4 changes: 2 additions & 2 deletions sitemapindex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ func Test_getSitemapIndex_NoIndexGiven_ErrorIsReturned(t *testing.T) {
defer testSitemapServer.Close()

testServerURL, _ := url.Parse(testSitemapServer.URL)
_, err := getSitemapIndex(*testServerURL)
_, err := getSitemapIndex(*testServerURL, "gargantua bot")

if err == nil {
t.Fail()
Expand All @@ -47,7 +47,7 @@ func Test_getSitemapIndex_IndexExists_IndexIsNotEmpty(t *testing.T) {
defer testSitemapServer.Close()

testServerURL, _ := url.Parse(testSitemapServer.URL)
sitemapIndex, err := getSitemapIndex(*testServerURL)
sitemapIndex, err := getSitemapIndex(*testServerURL, "gargantua bot")

if err != nil {
t.Fail()
Expand Down
1 change: 1 addition & 0 deletions vendor/github.com/PuerkitoBio/goquery/.gitattributes

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions vendor/github.com/PuerkitoBio/goquery/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions vendor/github.com/PuerkitoBio/goquery/.travis.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions vendor/github.com/andybalholm/cascadia/.travis.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Empty file modified vendor/github.com/andybalholm/cascadia/LICENSE
100755 → 100644
Empty file.
26 changes: 26 additions & 0 deletions vendor/github.com/gizak/termui/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions vendor/github.com/gizak/termui/.travis.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 2c56094

Please sign in to comment.