robotparser
index
/usr/local/lib/python2.3/robotparser.py

robotparser.py
 
Copyright (C) 2000  Bastian Kleineidam
 
You can choose between two licenses when using this package:
1) GNU GPLv2
2) PSF license for Python 2.2
 
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html

 
Modules
       
urllib
urlparse

 
Classes
       
Entry
urllib.FancyURLopener(urllib.URLopener)
URLopener
RobotFileParser
RuleLine

 
class Entry
    An entry has one or more user-agents and zero or more rulelines
 
  Methods defined here:
__init__(self)
__str__(self)
allowance(self, filename)
Preconditions:
- our agent applies to this entry
- filename is URL decoded
applies_to(self, useragent)
check if this entry applies to the specified agent

 
class RobotFileParser
    This class provides a set of methods to read, parse and answer
questions about a single robots.txt file.
 
  Methods defined here:
__init__(self, url='')
__str__(self)
can_fetch(self, useragent, url)
using the parsed robots.txt decide if useragent can fetch url
modified(self)
Sets the time the robots.txt file was last fetched to the
current time.
mtime(self)
Returns the time the robots.txt file was last fetched.
 
This is useful for long-running web spiders that need to
check for new robots.txt files periodically.
parse(self, lines)
parse the input lines from a robots.txt file.
A user-agent: line need not be preceded by
one or more blank lines.
read(self)
Reads the robots.txt URL and feeds it to the parser.
set_url(self, url)
Sets the URL referring to a robots.txt file.

 
class RuleLine
    A rule line is a single "Allow:" (allowance==1) or "Disallow:"
(allowance==0) followed by a path.
 
  Methods defined here:
__init__(self, path, allowance)
__str__(self)
applies_to(self, filename)

 
class URLopener(urllib.FancyURLopener)
    
Method resolution order:
URLopener
urllib.FancyURLopener
urllib.URLopener

Methods defined here:
__init__(self, *args)
http_error_default(self, url, fp, errcode, errmsg, headers)

Methods inherited from urllib.FancyURLopener:
get_user_passwd(self, host, realm, clear_cache=0)
http_error_301(self, url, fp, errcode, errmsg, headers, data=None)
Error 301 -- also relocated (permanently).
http_error_302(self, url, fp, errcode, errmsg, headers, data=None)
Error 302 -- relocated (temporarily).
http_error_303(self, url, fp, errcode, errmsg, headers, data=None)
Error 303 -- also relocated (essentially identical to 302).
http_error_307(self, url, fp, errcode, errmsg, headers, data=None)
Error 307 -- relocated, but turn POST into error.
http_error_401(self, url, fp, errcode, errmsg, headers, data=None)
Error 401 -- authentication required.
See this URL for a description of the basic authentication scheme:
http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt
prompt_user_passwd(self, host, realm)
Override this in a GUI environment!
redirect_internal(self, url, fp, errcode, errmsg, headers, data)
retry_http_basic_auth(self, url, realm, data=None)
retry_https_basic_auth(self, url, realm, data=None)

Methods inherited from urllib.URLopener:
__del__(self)
addheader(self, *args)
Add a header to be used by the HTTP interface only
e.g. u.addheader('Accept', 'sound/basic')
cleanup(self)
close(self)
http_error(self, url, fp, errcode, errmsg, headers, data=None)
Handle http errors.
Derived class can override this, or provide specific handlers
named http_error_DDD where DDD is the 3-digit error code.
open(self, fullurl, data=None)
Use URLopener().open(file) instead of open(file, 'r').
open_data(self, url, data=None)
Use "data" URL.
open_file(self, url)
Use local file or FTP depending on form of URL.
open_ftp(self, url)
Use FTP protocol.
open_gopher(self, url)
Use Gopher protocol.
open_http(self, url, data=None)
Use HTTP protocol.
open_https(self, url, data=None)
Use HTTPS protocol.
open_local_file(self, url)
Use local file.
open_unknown(self, fullurl, data=None)
Overridable interface to open unknown URL type.
open_unknown_proxy(self, proxy, fullurl, data=None)
Overridable interface to open unknown URL type.
retrieve(self, url, filename=None, reporthook=None, data=None)
retrieve(url) returns (filename, headers) for a local object
or (tempfilename, headers) for a remote object.

Data and other attributes inherited from urllib.URLopener:
version = 'Python-urllib/1.15'

 
Data
        __all__ = ['RobotFileParser']
debug = 0