robotparser
/usr/lib/python2.2/robotparser.py

 robotparser.py
 
Copyright (C) 2000  Bastian Kleineidam
 
You can choose between two licenses when using this package:
1) GNU GPLv2
2) PYTHON 2.0 OPEN SOURCE LICENSE
 
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
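
A minimal usage sketch (Python 2; the host name below is only an example):
fetch a site's robots.txt and ask whether a given crawler may request a URL.

    import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url('http://www.example.com/robots.txt')
    rp.read()       # download and parse the robots.txt file
    if rp.can_fetch('MyCrawler', 'http://www.example.com/private/page.html'):
        print 'allowed to fetch'
    else:
        print 'disallowed by robots.txt'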

 
Modules
            
re
urllib
urlparse
 
Classes
            
Entry
urllib.FancyURLopener(urllib.URLopener)
    URLopener
RobotFileParser
RuleLine
 
class Entry
      An entry has one or more user-agents and zero or more rulelines
 
   Methods defined here:
__init__(self)
__str__(self)
allowance(self, filename)
Preconditions:
- our agent applies to this entry
- filename is URL decoded
applies_to(self, useragent)
check if this entry applies to the specified agent

Data and non-method functions defined here:
__doc__ = 'An entry has one or more user-agents and zero or more rulelines'
__module__ = 'robotparser'
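
A hypothetical sketch of how Entry objects behave. Entry is an internal helper
(only RobotFileParser is exported via __all__), and the attribute names
useragents and rulelines below are assumptions, not a documented interface:

    import robotparser

    entry = robotparser.Entry()
    entry.useragents.append('GoodBot')                            # assumed attribute
    entry.rulelines.append(robotparser.RuleLine('/private/', 0))  # Disallow: /private/

    print entry.applies_to('GoodBot/2.1')        # agent token matches, expect a true value
    print entry.allowance('/private/data.html')  # the Disallow rule applies, expect 0
    print entry.allowance('/index.html')         # no rule applies, expect 1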
 
class RobotFileParser
       
   Methods defined here:
__init__(self, url='')
__str__(self)
can_fetch(self, useragent, url)
using the parsed robots.txt decide if useragent can fetch url
modified(self)
mtime(self)
parse(self, lines)
parse the input lines from a robots.txt file.
We allow a user-agent: line that is not preceded by
one or more blank lines.
read(self)
set_url(self, url)

Data and non-method functions defined here:
__doc__ = None
__module__ = 'robotparser'
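
A sketch of using RobotFileParser offline: feed parse() lines that are already
in memory instead of calling read(), and use modified()/mtime() to track when
the rules were last loaded. The rules below are invented for illustration; the
trailing blank line marks the end of the record:

    import robotparser

    lines = [
        'User-agent: *',
        'Disallow: /cgi-bin/',
        'Disallow: /private/',
        '',                          # blank line ends the record
    ]

    rp = robotparser.RobotFileParser()
    rp.parse(lines)
    rp.modified()                                       # record when the rules were set
    print rp.can_fetch('MyCrawler', '/private/x.html')  # expect 0 (disallowed)
    print rp.can_fetch('MyCrawler', '/index.html')      # expect 1 (allowed)
    print rp.mtime()                                    # timestamp stored by modified()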
 
class RuleLine
      A rule line is a single "Allow:" (allowance==1) or "Disallow:"
(allowance==0) followed by a path.
 
   Methods defined here:
__init__(self, path, allowance)
__str__(self)
applies_to(self, filename)

Data and non-method functions defined here:
__doc__ = 'A rule line is a single "Allow:" (allowance==1)...low:"\n (allowance==0) followed by a path.'
__module__ = 'robotparser'
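
A small illustrative sketch of RuleLine on its own; like Entry it is an
internal helper rather than a supported interface:

    import robotparser

    allow_rule = robotparser.RuleLine('/public/', 1)   # Allow: /public/
    block_rule = robotparser.RuleLine('/cgi-bin/', 0)  # Disallow: /cgi-bin/

    print str(allow_rule)                              # the rule rendered back as text
    print str(block_rule)
    print block_rule.applies_to('/cgi-bin/form.cgi')   # path matches, expect a true value
    print block_rule.applies_to('/index.html')         # no match, expect a false value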
 
class URLopener(urllib.FancyURLopener)
       
  
Method resolution order:
URLopener
urllib.FancyURLopener
urllib.URLopener

Methods defined here:
__init__(self, *args)
http_error_302(self, url, fp, errcode, errmsg, headers, data=None)
http_error_default(self, url, fp, errcode, errmsg, headers)

Data and non-method functions defined here:
__doc__ = None
__module__ = 'robotparser'

Methods inherited from urllib.FancyURLopener:
get_user_passwd(self, host, realm, clear_cache=0)
http_error_301(self, url, fp, errcode, errmsg, headers, data=None)
Error 301 -- also relocated (permanently).
http_error_401(self, url, fp, errcode, errmsg, headers, data=None)
Error 401 -- authentication required.
See this URL for a description of the basic authentication scheme:
http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt
prompt_user_passwd(self, host, realm)
Override this in a GUI environment!
redirect_internal(self, url, fp, errcode, errmsg, headers, data)
retry_http_basic_auth(self, url, realm, data=None)
retry_https_basic_auth(self, url, realm, data=None)

Methods inherited from urllib.URLopener:
__del__(self)
addheader(self, *args)
Add a header to be used by the HTTP interface only
e.g. u.addheader('Accept', 'sound/basic')
cleanup(self)
close(self)
http_error(self, url, fp, errcode, errmsg, headers, data=None)
Handle http errors.
Derived class can override this, or provide specific handlers
named http_error_DDD where DDD is the 3-digit error code.
open(self, fullurl, data=None)
Use URLopener().open(file) instead of open(file, 'r').
open_data(self, url, data=None)
Use "data" URL.
open_file(self, url)
Use local file or FTP depending on form of URL.
open_ftp(self, url)
Use FTP protocol.
open_gopher(self, url)
Use Gopher protocol.
open_http(self, url, data=None)
Use HTTP protocol.
open_https(self, url, data=None)
Use HTTPS protocol.
open_local_file(self, url)
Use local file.
open_unknown(self, fullurl, data=None)
Overridable interface to open unknown URL type.
open_unknown_proxy(self, proxy, fullurl, data=None)
Overridable interface to open unknown URL type.
retrieve(self, url, filename=None, reporthook=None, data=None)
retrieve(url) returns (filename, None) for a local object
or (tempfilename, headers) for a remote object.

Data and non-method functions inherited from urllib.URLopener:
_URLopener__tempfiles = None
version = 'Python-urllib/1.15'
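
The URLopener subclass above overrides http_error_302 and http_error_default,
which is the usual way to make urllib report HTTP status codes back to the
caller (here, presumably so RobotFileParser.read() can react to error responses
for robots.txt). A minimal sketch of that general pattern, not the module's
actual code; the class name and the errcode attribute are invented:

    import urllib

    class StatusRecordingOpener(urllib.FancyURLopener):
        def __init__(self, *args):
            urllib.FancyURLopener.__init__(self, *args)
            self.errcode = 200            # assume success until an error handler fires

        def http_error_default(self, url, fp, errcode, errmsg, headers):
            self.errcode = errcode        # remember the status for the caller
            return urllib.FancyURLopener.http_error_default(
                self, url, fp, errcode, errmsg, headers)

    opener = StatusRecordingOpener()
    f = opener.open('http://www.example.com/robots.txt')
    print opener.errcode                  # 200, or e.g. 404 if there is no robots.txt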
 
Functions
            
_check(a, b)
_debug(msg)
_test()
 
Data
__all__ = ['RobotFileParser']
__file__ = '/usr/lib/python2.2/robotparser.pyc'
__name__ = 'robotparser'
debug = 0