initial commit
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
*sites.db*
|
6
go.mod
Normal file
6
go.mod
Normal file
@@ -0,0 +1,6 @@
|
||||
module squibid/scraper
|
||||
|
||||
go 1.24.5
|
||||
|
||||
require golang.org/x/net v0.43.0
|
||||
require github.com/mattn/go-sqlite3 v1.14.32
|
4
go.sum
Normal file
4
go.sum
Normal file
@@ -0,0 +1,4 @@
|
||||
github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs=
|
||||
github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
||||
golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE=
|
||||
golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg=
|
105
main.go
Normal file
105
main.go
Normal file
@@ -0,0 +1,105 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
var db *sql.DB
|
||||
|
||||
func db_insert_url(url string, seen bool) {
|
||||
db.Exec(`insert into urls values (?, ?)`, url, seen)
|
||||
}
|
||||
|
||||
func deal_html(site string, reader io.Reader) {
|
||||
doc, err := html.Parse(reader)
|
||||
if err != nil {
|
||||
fmt.Println("Error parsing HTML:", err)
|
||||
return
|
||||
}
|
||||
|
||||
var insert_url string
|
||||
var walk func(n *html.Node)
|
||||
walk = func(n *html.Node) {
|
||||
if n.Type == html.ElementNode && n.Data == "a" {
|
||||
for _, attr := range n.Attr {
|
||||
if attr.Key == "href" && len(attr.Val) > 1 {
|
||||
if attr.Val[:2] == "//" {
|
||||
insert_url = "https:" + attr.Val
|
||||
} else if attr.Val[:1] == "/" {
|
||||
if site[len(site) - 1:] == "/" {
|
||||
insert_url = site + attr.Val[1:]
|
||||
} else {
|
||||
insert_url = site + attr.Val
|
||||
}
|
||||
} else if len(attr.Val) > 4 && attr.Val[:4] == "http" {
|
||||
insert_url = attr.Val
|
||||
}
|
||||
db_insert_url(insert_url, site == insert_url)
|
||||
}
|
||||
}
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
walk(c)
|
||||
}
|
||||
}
|
||||
walk(doc)
|
||||
}
|
||||
|
||||
func main() {
|
||||
var err any
|
||||
|
||||
site := "https://squi.bid/"
|
||||
db, err = sql.Open("sqlite3", "./sites.db")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
db.Exec(`
|
||||
create table if not exists
|
||||
urls (url text not null primary key, indexed boolean not null);
|
||||
`)
|
||||
|
||||
/* start fetching the sites */
|
||||
for i := 0;; i++ {
|
||||
if i > 0 {
|
||||
rows, err := db.Query(`select url from urls where indexed is false`)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
for rows.Next() {
|
||||
var test string
|
||||
rows.Scan(&test)
|
||||
site = test
|
||||
/* we can't just check if the site is the same because then when we're
|
||||
* checking squi.bid/example it won't register squi.bid as the same
|
||||
* domain, although maybe that's what we want.
|
||||
*/
|
||||
if !strings.Contains(test, site) {
|
||||
break
|
||||
}
|
||||
}
|
||||
rows.Close()
|
||||
}
|
||||
|
||||
fmt.Println("fetching " + site)
|
||||
resp, err := http.Get(site)
|
||||
if err != nil {
|
||||
fmt.Println("Error getting", site)
|
||||
continue
|
||||
}
|
||||
|
||||
deal_html(site, resp.Body)
|
||||
db.Exec(`update urls set indexed = true where url == ?`, site)
|
||||
|
||||
resp.Body.Close()
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user