Documentation
¶
Index ¶
- Constants
- type AppState
- type Connector
- type Crawler
- type EdgesSet
- type RMEntry
- type Record
- type RecordManager
- func (rm *RecordManager) AddEdge(fromURL string, toURL string) error
- func (rm *RecordManager) AddRecord(entry RMEntry)
- func (rm *RecordManager) Count() int
- func (rm *RecordManager) Dump() map[string]Record
- func (rm *RecordManager) Exists(rawURL string) bool
- func (rm *RecordManager) Get(rawURL string) (Record, bool)
- func (rm *RecordManager) LoadFromReader(r io.Reader) error
- func (rm *RecordManager) SaveToWriter(w io.Writer, indent bool) error
- func (rm *RecordManager) Update(rawURL string, statusCode int, err error) error
- type Result
- type StatsCLIOutWriter
- func (sm *StatsCLIOutWriter) AddErrorEntry(value string)
- func (sm *StatsCLIOutWriter) AddLatencySample(value time.Duration)
- func (sm *StatsCLIOutWriter) IncDecDepth(value int)
- func (sm *StatsCLIOutWriter) IncDecErrorsCount(value int)
- func (sm *StatsCLIOutWriter) IncDecLinksCount(value int)
- func (sm *StatsCLIOutWriter) IncDecLinksInQueue(value int)
- func (sm *StatsCLIOutWriter) IncDecTotalRequestsCount(value int)
- func (sm *StatsCLIOutWriter) IncDecWorkersRunning(value int)
- func (sm *StatsCLIOutWriter) RunOutputFlusher()
- func (sm *StatsCLIOutWriter) SetAppState(state AppState)
- func (sm *StatsCLIOutWriter) SetDepth(value int)
- func (sm *StatsCLIOutWriter) SetErrorsCount(value int)
- func (sm *StatsCLIOutWriter) SetLinksCount(value int)
- func (sm *StatsCLIOutWriter) SetLinksInQueue(value int)
- func (sm *StatsCLIOutWriter) SetTotalRequestsCount(value int)
- func (sm *StatsCLIOutWriter) SetWorkersRunning(value int)
- type StatsManager
- type Task
- type URLEntity
- type WebClient
Constants ¶
const (
// AppState_Unknown represents the 'unknown' state.
AppState_Unknown = iota
// AppState_IDLE represents the 'idle' state.
AppState_IDLE
// AppState_Running represents the 'run' state.
AppState_Running
// AppState_Finished represents the 'finish' state.
AppState_Finished
)
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type AppState ¶
type AppState int
AppState represents the current state of the App.
type Connector ¶
type Connector interface {
GetLinks(rawURL string) (statusCode int, links []URLEntity, latency time.Duration, err error)
}
Connector describes the connector interface.
type Crawler ¶
type Crawler struct {
// Read-only vars
InitialURL string
Stats bool
ShowErrors bool
WorkersCount int
Depth int
StayInSubdomain bool
TreeMode bool
SubDomain string
Retry int
// contains filtered or unexported fields
}
Crawler brings everything together and is responsible for starting goroutines and managing them.
func NewCrawler ¶
func NewCrawler(connector Connector, initialURL string, retry int, linksWriter io.Writer, stats bool, showErrors bool, stayinsubdomain bool, treemode bool, workersCount int, depth int) (*Crawler, error)
NewCrawler returns a new Crawler.
func (*Crawler) Merger ¶
Merger gets the results from the workers (links) and keeps all the relevant information, feeding the new links to workers via another channel.
func (*Crawler) StatsWriter ¶
StatsWriter writes stats to an io.Writer (e.g. os.Stdout).
type EdgesSet ¶
type EdgesSet map[int]struct{}
func NewEdgesSet ¶
func NewEdgesSet() EdgesSet
func (EdgesSet) MarshalJSON ¶
func (*EdgesSet) UnmarshalJSON ¶
type Record ¶
type Record struct {
// Index allows easy referencing of records (used in the edges)
Index int `json:"index"`
// This indicates whether this is the start of the graph
// i.e., URL provided.
InitPoint bool `json:"initPoint"`
URL string `json:"url"`
Host string `json:"host"`
Depth int `json:"depth"`
// Edges []uint `json:"edges"`
// This is supposed to be mimicking a hashset
// We use a struct as a value as it's a bit more space efficient
Edges EdgesSet `json:"edges"`
StatusCode int `json:"statusCode"`
ErrString string `json:"errString,omitempty"`
}
Record represents an entry in the RecordManager (internal state).
type RecordManager ¶
type RecordManager struct {
// Keeps a table of Records. Key is the URL (scheme,authority,path,query)
Records map[string]Record
IndexCount int
}
RecordManager keeps track of links visited and some metadata like depth level and its children.
func NewRecordManager ¶
func NewRecordManager() *RecordManager
NewRecordManager returns a new Record Manager.
func (*RecordManager) AddEdge ¶
func (rm *RecordManager) AddEdge(fromURL string, toURL string) error
AddEdge adds a new edge to a record if not already present.
func (*RecordManager) AddRecord ¶
func (rm *RecordManager) AddRecord(entry RMEntry)
AddRecord adds a record to the RecordManager.
func (*RecordManager) Count ¶
func (rm *RecordManager) Count() int
Count counts the number of records.
func (*RecordManager) Dump ¶
func (rm *RecordManager) Dump() map[string]Record
Dump returns all records in the RecordManager.
func (*RecordManager) Exists ¶
func (rm *RecordManager) Exists(rawURL string) bool
Exists checks whether this URL exists in the table.
func (*RecordManager) Get ¶
func (rm *RecordManager) Get(rawURL string) (Record, bool)
Get returns a record from the Record Manager.
func (*RecordManager) LoadFromReader ¶
func (rm *RecordManager) LoadFromReader(r io.Reader) error
LoadFromReader reads the records from a Reader in JSON format. An os.File can be passed to read from a file.
func (*RecordManager) SaveToWriter ¶
func (rm *RecordManager) SaveToWriter(w io.Writer, indent bool) error
SaveToWriter dumps the records map into a Writer in JSON format. An os.File can be passed to write to a file.
type Result ¶
type Result struct {
ParentURL string
StatusCode int
Links []URLEntity
// Depth of the ParentURL
Depth int
Err error
}
Result is what workers return in a channel.
type StatsCLIOutWriter ¶
type StatsCLIOutWriter struct {
// contains filtered or unexported fields
}
StatsCLIOutWriter keeps track of stats and writes up-to-date stats to a writer.
func NewStatsCLIOutWriter ¶
func NewStatsCLIOutWriter(writer io.Writer, showErrors bool, totalWorkersCount int, depth int) *StatsCLIOutWriter
NewStatsCLIOutWriter returns a new StatsCLIOutWriter.
func (*StatsCLIOutWriter) AddErrorEntry ¶
func (sm *StatsCLIOutWriter) AddErrorEntry(value string)
func (*StatsCLIOutWriter) AddLatencySample ¶
func (sm *StatsCLIOutWriter) AddLatencySample(value time.Duration)
func (*StatsCLIOutWriter) IncDecDepth ¶
func (sm *StatsCLIOutWriter) IncDecDepth(value int)
func (*StatsCLIOutWriter) IncDecErrorsCount ¶
func (sm *StatsCLIOutWriter) IncDecErrorsCount(value int)
func (*StatsCLIOutWriter) IncDecLinksCount ¶
func (sm *StatsCLIOutWriter) IncDecLinksCount(value int)
func (*StatsCLIOutWriter) IncDecLinksInQueue ¶
func (sm *StatsCLIOutWriter) IncDecLinksInQueue(value int)
func (*StatsCLIOutWriter) IncDecTotalRequestsCount ¶
func (sm *StatsCLIOutWriter) IncDecTotalRequestsCount(value int)
func (*StatsCLIOutWriter) IncDecWorkersRunning ¶
func (sm *StatsCLIOutWriter) IncDecWorkersRunning(value int)
func (*StatsCLIOutWriter) RunOutputFlusher ¶
func (sm *StatsCLIOutWriter) RunOutputFlusher()
RunOutputFlusher writes the updated stats to an io.Writer. Run this in a goroutine.
func (*StatsCLIOutWriter) SetAppState ¶
func (sm *StatsCLIOutWriter) SetAppState(state AppState)
func (*StatsCLIOutWriter) SetDepth ¶
func (sm *StatsCLIOutWriter) SetDepth(value int)
func (*StatsCLIOutWriter) SetErrorsCount ¶
func (sm *StatsCLIOutWriter) SetErrorsCount(value int)
func (*StatsCLIOutWriter) SetLinksCount ¶
func (sm *StatsCLIOutWriter) SetLinksCount(value int)
func (*StatsCLIOutWriter) SetLinksInQueue ¶
func (sm *StatsCLIOutWriter) SetLinksInQueue(value int)
func (*StatsCLIOutWriter) SetTotalRequestsCount ¶
func (sm *StatsCLIOutWriter) SetTotalRequestsCount(value int)
func (*StatsCLIOutWriter) SetWorkersRunning ¶
func (sm *StatsCLIOutWriter) SetWorkersRunning(value int)
type StatsManager ¶
type StatsManager interface {
SetAppState(state AppState)
SetLinksInQueue(value int)
IncDecLinksInQueue(value int)
SetLinksCount(value int)
IncDecLinksCount(value int)
SetErrorsCount(value int)
IncDecErrorsCount(value int)
SetWorkersRunning(value int)
IncDecWorkersRunning(value int)
SetTotalRequestsCount(value int)
IncDecTotalRequestsCount(value int)
SetDepth(value int)
IncDecDepth(value int)
AddLatencySample(value time.Duration)
RunOutputFlusher()
}
StatsManager represents a tracker of statistics related to the crawler. This interface is unfortunately quite big as it needs to support several operations on the statistics it keeps track of.
type URLEntity ¶
type URLEntity struct {
// NetLoc represents the NetLoc portion of the URL
NetLoc string
// Raw represents the entire URL
Raw string
}
URLEntity represents a URL.
func ExtractURL ¶
ExtractURL takes any URL and returns a URL string with scheme, authority, and path, ready to be used as a parent URL.
type WebClient ¶
type WebClient struct {
// contains filtered or unexported fields
}
WebClient is responsible for connecting to the links and managing connections to websites. It implements the Connector interface.
func NewWebClient ¶
NewWebClient returns a new WebClient.