robotparser
/usr/lib/python2.1/robotparser.py

 robotparser.py
 
Copyright (C) 2000  Bastian Kleineidam
 
You can choose between two licenses when using this package:
1) GNU GPLv2
2) PYTHON 2.0 OPEN SOURCE LICENSE
 
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
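
For reference, a robots.txt file in the format that protocol describes consists of one or more groups, each naming user agents followed by Allow/Disallow rules. The agent names and paths below are purely illustrative:

    User-agent: WebCrawler
    Disallow: /tmp/
    Allow: /tmp/public.html

    User-agent: *
    Disallow: /private/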

 
Modules
            
re
urllib
urlparse

 
Classes
            
Entry
urllib.FancyURLopener(urllib.URLopener)
URLopener
RobotFileParser
RuleLine

 
class Entry
      An entry has one or more user-agents and zero or more rulelines (see the sketch after this class).
 
  
__init__(self)
__str__(self)
allowance(self, filename)
Preconditions:
- our agent applies to this entry
- filename is URL decoded
applies_to(self, useragent)
check if this entry applies to the specified agent
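
How the parser populates an entry can be sketched roughly as follows. The attribute names useragents and rulelines are assumptions based on this listing, not part of the documented interface, and the agent name and paths are placeholders:

    # Illustrative sketch only: internal attribute names are assumed.
    entry = Entry()                               # one entry per robots.txt group
    entry.useragents.append('WebCrawler')         # assumed list attribute
    entry.rulelines.append(RuleLine('/tmp/', 0))  # models "Disallow: /tmp/"
    if entry.applies_to('WebCrawler/3.0'):
        print entry.allowance('/tmp/test.html')   # prints 0: fetching is disallowed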

 
class RobotFileParser
      Reads, parses, and answers questions about a single robots.txt file.
  
__init__(self, url='')
__str__(self)
can_fetch(self, useragent, url)
using the parsed robots.txt, decide if useragent can fetch url
(see the usage example after this class)
modified(self)
mtime(self)
parse(self, lines)
parse the input lines from a robots.txt file.
A user-agent: line need not be preceded by
one or more blank lines.
read(self)
set_url(self, url)
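
A typical use of this class, based on the methods listed above; the URL and user-agent string are placeholders:

    import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url('http://www.example.com/robots.txt')
    rp.read()      # fetch and parse the robots.txt file
    if rp.can_fetch('MyCrawler', 'http://www.example.com/private/page.html'):
        print 'allowed to fetch'
    else:
        print 'disallowed by robots.txt'

Alternatively, parse() accepts the lines of a robots.txt file obtained by other means, and mtime() and modified() track when the file was last checked, so a long-running crawler can decide when to fetch it again.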

 
class RuleLine
      A rule line is a single "Allow:" (allowance==1) or "Disallow:"
(allowance==0) followed by a path.
 
  
__init__(self, path, allowance)
__str__(self)
applies_to(self, filename)
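
A small sketch of RuleLine on its own; the path and filenames are illustrative:

    line = RuleLine('/private/', 0)               # models "Disallow: /private/"
    print line.applies_to('/private/index.html')  # the rule matches this path
    print line.applies_to('/public/index.html')   # the rule does not match this one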

 
class URLopener(urllib.FancyURLopener)
      Opener used internally to fetch the robots.txt file; it overrides __init__, http_error_302 and http_error_default of urllib.FancyURLopener.
  
__del__(self) from urllib.URLopener
__init__(self, *args)
addheader(self, *args) from urllib.URLopener
cleanup(self) from urllib.URLopener
close(self) from urllib.URLopener
get_user_passwd(self, host, realm, clear_cache=0) from urllib.FancyURLopener
http_error(self, url, fp, errcode, errmsg, headers, data=None) from urllib.URLopener
http_error_301(self, url, fp, errcode, errmsg, headers, data=None) from urllib.FancyURLopener
http_error_302(self, url, fp, errcode, errmsg, headers, data=None)
http_error_401(self, url, fp, errcode, errmsg, headers, data=None) from urllib.FancyURLopener
http_error_default(self, url, fp, errcode, errmsg, headers)
open(self, fullurl, data=None) from urllib.URLopener
open_data(self, url, data=None) from urllib.URLopener
open_file(self, url) from urllib.URLopener
open_ftp(self, url) from urllib.URLopener
open_gopher(self, url) from urllib.URLopener
open_http(self, url, data=None) from urllib.URLopener
open_https(self, url, data=None) from urllib.URLopener
open_local_file(self, url) from urllib.URLopener
open_unknown(self, fullurl, data=None) from urllib.URLopener
open_unknown_proxy(self, proxy, fullurl, data=None) from urllib.URLopener
prompt_user_passwd(self, host, realm) from urllib.FancyURLopener
redirect_internal(self, url, fp, errcode, errmsg, headers, data) from urllib.FancyURLopener
retrieve(self, url, filename=None, reporthook=None, data=None) from urllib.URLopener
retry_http_basic_auth(self, url, realm, data=None) from urllib.FancyURLopener
retry_https_basic_auth(self, url, realm, data=None) from urllib.FancyURLopener

 
Functions
            
_check(a, b)
_debug(msg)
_test()

 
Data
__all__ = ['RobotFileParser']
__file__ = '/usr/lib/python2.1/robotparser.pyc'
__name__ = 'robotparser'
debug = 0