Documentation
¶
Index ¶
- func GetAttrName(selector string) string
- func GetBaseURL(fullURL string) string
- func GetCurrentURL(fullURL string) string
- func GetFloat(htmlText, selector string) (float64, error)
- func GetFullURL(baseURL, relativePath string) string
- func GetInt(htmlText, selector string) (int, error)
- func GetOuterHTML(htmlText, selector string) ([]string, error)
- func GetText(htmlText, selector string) ([]string, error)
- func GetTextSingle(htmlText, selector string) (string, error)
- func GetTime(htmlText, selector, format string) (*time.Time, error)
- type ExtractionFunc
- type Options
- type PaginationConfig
- type Result
- type Scraper
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func GetAttrName ¶
GetAttrName extracts the attribute name from a CSS selector that ends with an attribute selector. Returns the attribute name if the selector ends with an attribute selector, or an empty string otherwise. Examples: "div[data-id]" -> "data-id", "input[type='text']" -> "type", "a[href]" -> "href".
func GetBaseURL ¶
func GetCurrentURL ¶ added in v0.1.18
GetCurrentURL extracts just the path from a full URL, removing the query parameters and fragments
func GetFloat ¶
GetFloat extracts text from the first element matching the selector and converts it to float64. Returns 0.0 if no match is found or the conversion fails.
func GetFullURL ¶
func GetInt ¶
GetInt extracts text from the first element matching the selector and converts it to int. Returns 0 if no match is found or the conversion fails.
func GetOuterHTML ¶
GetOuterHTML extracts the outer HTML of elements matching the given CSS selector from HTML text. Returns a slice of outer HTML strings, one for each matching element.
func GetText ¶
GetText extracts the text content of elements matching the given CSS selector from HTML text. Returns a slice of text strings, one for each matching element.
func GetTextSingle ¶
GetTextSingle extracts the text content of the first element matching the given CSS selector. Returns an empty string if no match is found.
Types ¶
type ExtractionFunc ¶
type Options ¶
type Options struct {
// UserAgent to use for requests
UserAgent string
// AllowedDomains restricts scraping to specific domains
AllowedDomains []string
// MaxDepth limits how deep the scraper will follow links
MaxDepth int
// Async enables asynchronous scraping
Async bool
// MaxParallelRequests sets the maximum number of parallel requests
MaxParallelRequests int
// MaxRetries specifies the maximum number of retries for requests
MaxRetries int
// UseCloudflareBypass enables Cloudflare bypass using proper TLS and headers
// Helps avoid triggering Cloudflare challenges in the first place
UseCloudflareBypass bool
// Logger is an optional custom logger used for debug logging
Logger *zap.Logger
}
Options provides configuration for the Scraper
type PaginationConfig ¶
type PaginationConfig struct {
// NextPageSelector is the CSS selector for the "next page" link
// if the selector matches no elements, pagination stops
NextPageSelector string
// LastPageSelector is the CSS selector that indicates the last page number.
// Pagination proceeds by incrementing the page number until it reaches the
// value matched by this selector, using NextPageURLPattern to construct each page URL
LastPageSelector string
// NextPageURLPattern is an optional pattern used to construct the next page URL by
// replacing the '::page::' placeholder with the page number.
// This is mandatory if LastPageSelector is used
NextPageURLPattern string
}
PaginationConfig holds configuration for paginated scraping
type Scraper ¶
type Scraper struct {
// contains filtered or unexported fields
}
Scraper represents an HTML scraper with configurable options
func NewDefault ¶
func NewDefault() *Scraper
NewDefault creates a new Scraper instance with default options
func (*Scraper) ScrapeHTML ¶
ScrapeHTML fetches and returns the complete HTML content for a given URL. It implements exponential backoff retry for 429 (Too Many Requests) status codes, detects bot challenges, and uses rod to solve CAPTCHAs and obtain cookies.
func (*Scraper) ScrapeOuterHTML ¶
ScrapeOuterHTML fetches the outer HTML of elements matching the given CSS selector
func (*Scraper) ScrapePaginated ¶
func (s *Scraper) ScrapePaginated(url, selector string, config PaginationConfig) (<-chan Result, error)
ScrapePaginated scrapes the outer HTML of elements matching the selector across multiple pages. Returns a receive-only channel that streams results as they are scraped, together with an error value.