Web site spider tool

May 25, 2020

Web site spider tool

from HTMLParser import HTMLParser
import urllib2

# Start page whose anchor links the spider will list.
url = "http://www.ikman.lk/"

# Browser-like request headers, built from (name, value) pairs.
hdr = dict([
    ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
    ('Accept-Encoding', 'none'),
    ('Accept-Language', 'en-US,en;q=0.8'),
    ('Connection', 'keep-alive'),
])
class myparser(HTMLParser):
    """HTML parser that prints the target of every ``<a href=...>`` it sees.

    Absolute http(s) links are printed as-is; every other link is printed
    joined onto the module-level ``url``. Links are only printed, never
    followed.
    """

    @staticmethod
    def _absolute(link, base):
        """Return ``link`` as an absolute URL, resolving it against ``base``.

        link -- the raw ``href`` value (a string).
        base -- the site URL, with or without a trailing slash.

        A link is treated as already absolute only when it *starts* with
        ``http://`` or ``https://`` (case-insensitive); a link that merely
        contains "http" somewhere (e.g. in a query string) is still relative.
        """
        if link.lower().startswith(("http://", "https://")):
            return link
        # Strip trailing slashes instead of blindly dropping the last
        # character, so a base without a trailing slash is not truncated.
        root = base.rstrip('/')
        # Compare by value: identity (`is`) on strings is unreliable.
        if link.startswith('/'):
            return root + link
        return root + '/' + link

    def handle_starttag(self, tag, attrs):
        """Print the absolute form of each ``href`` on an ``<a>`` start tag.

        tag   -- tag name as passed in by HTMLParser.
        attrs -- list of (name, value) pairs for the tag's attributes.
        """
        if tag != "a":
            return
        for name, value in attrs:
            # A valueless attribute (`<a href>`) is reported with value None.
            if name != "href" or value is None:
                continue
            print(self._absolute(value, url))

# Fetch the start page with the custom headers and run it through the parser,
# which prints every link it finds.
req = urllib2.Request(url, headers=hdr)
resp = urllib2.urlopen(req)
try:
    parsers = myparser()
    parsers.feed(resp.read())
finally:
    # Release the HTTP connection even if reading or parsing fails.
    resp.close()

Search This Blog

CODES.AVI

Web site spider tool

Comments

Post a Comment

Popular Posts

for if else in one line

Logic bomb