Web site spider tool
import urllib2
from HTMLParser import HTMLParser
from urlparse import urljoin
# Start page that the spider downloads; the parser below also uses it as the
# base when printing non-absolute links.
url = "http://www.ikman.lk/"

# Request headers sent with the download (a desktop Chrome User-Agent plus
# the usual Accept-* negotiation fields).
hdr = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
        '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}
class myparser(HTMLParser):
    """HTML parser that prints the absolute URL of every ``<a href=...>``.

    Args:
        base_url: URL that relative links are resolved against. When left as
            ``None`` the module-level ``url`` is looked up each time a link
            is handled, so a plain ``myparser()`` keeps working as before.
    """

    def __init__(self, base_url=None):
        # Call the base initializer directly: on Python 2 HTMLParser is an
        # old-style class, so super() cannot be used.
        HTMLParser.__init__(self)
        self.base_url = base_url

    def handle_starttag(self, tag, attrs):
        """Print the resolved target of each ``href`` attribute on an ``<a>`` tag.

        Args:
            tag: Lower-cased tag name supplied by HTMLParser.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != "a":
            return
        base = url if self.base_url is None else self.base_url
        for name, value in attrs:
            # A valueless attribute (``<a href>``) is reported with a value
            # of None; there is nothing to resolve, so skip it.
            if name != "href" or value is None:
                continue
            # urljoin returns absolute links unchanged and resolves
            # root-relative, document-relative and protocol-relative ones
            # against the base, whether or not the base ends with "/".
            print(urljoin(base, value))
            # NOTE: following each discovered link (a recursive crawl) is not
            # implemented; this parser only reports the links it finds.
# Download the start page using the custom request headers and feed the
# returned HTML to the link-printing parser.
req = urllib2.Request(url, headers=hdr)
resp = urllib2.urlopen(req)
try:
    parsers = myparser()
    parsers.feed(resp.read())
finally:
    # Release the HTTP connection even if reading or parsing fails.
    resp.close()
Comments
Post a Comment