""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import urllib.error
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

RequestRate = collections.namedtuple("RequestRate", "requests seconds")


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.sitemaps = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                # Access to robots.txt itself is forbidden: disallow everything.
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                # Other client errors (e.g. 404): treat as "no robots.txt".
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # The default entry is considered last; the first one found wins.
            if self.default_entry is None:
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # States:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # Remove an optional comment and strip the line.
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # Only accept a well-formed integer value.
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # Accept only two sane integer values ("requests/seconds").
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]),
                                                         int(numbers[1]))
                        state = 2
                elif line[0] == "sitemap":
                    # Sitemap directives are independent of the user-agent
                    # line, so they do not change the parser state.
                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not to exist,
        # assume that no url is allowable.  This prevents false positives
        # when a user erroneously calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # Search the entries for the given user agent; the first match counts.
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # Try the default ("*") entry last.
        if self.default_entry:
            return self.default_entry.allowance(url)
        # Agent not found ==> access granted.
        return True

    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def site_maps(self):
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # An empty disallow value means "allow all".
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # Split off any version token and compare case-insensitively.
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # The catch-all agent matches everyone.
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
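

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the module above).
# The robots.txt URL and the "ExampleBot" user-agent string are hypothetical
# placeholders chosen for this example; substitute real values when trying it.
if __name__ == "__main__":
    rp = RobotFileParser()
    rp.set_url("https://www.example.com/robots.txt")  # hypothetical URL
    rp.read()  # fetch and parse the robots.txt file

    # Ask whether a particular user agent may fetch a particular URL.
    print(rp.can_fetch("ExampleBot", "https://www.example.com/some/page.html"))

    # Politeness hints, if the file declares them (each returns None otherwise).
    print(rp.crawl_delay("ExampleBot"))
    print(rp.request_rate("ExampleBot"))
    print(rp.site_maps())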