You can make your code even better.
# Python page spider: crawls a site starting from `url` and collects
# on-site links, image sources, and form action URLs.
import sys, urlparse, urllib
from bs4 import BeautifulSoup
from datetime import datetime
# Crawl configuration: start page, plus the registered domain used to keep
# the spider on-site.
url = "http://hacktizen.blogspot.com/"

# Reduce the full hostname to its registered domain, e.g.
# "hacktizen.blogspot.com" -> "blogspot.com".  When the second-level label
# is short (< 4 chars, e.g. "co" in "example.co.uk"), keep three labels so
# country-code TLD pairs are not split.  (Replaces the fragile
# `cond and a or b` idiom with an explicit conditional expression.)
labels = urlparse.urlparse(url).hostname.split(".")
hostname = ".".join(labels[-3:] if len(labels[-2]) < 4 else labels[-2:])

urls = [url]      # FIFO queue of pages still to scrape
visited = [url]   # every URL ever enqueued, to avoid re-crawling
imgs = []         # on-site image URLs found so far
forms = []        # on-site form action URLs found so far

print("Search")
tstart = datetime.now()  # wall-clock start for the final summary line
# Breadth-first crawl: pop the next URL, fetch it, and harvest same-domain
# links (to extend the crawl), image sources, and form actions.
while urls:
    page = urls.pop(0)
    try:
        htmltext = urllib.urlopen(page).read()
    except Exception:
        # Fetch failed (DNS, HTTP, timeout, ...): report and skip this page.
        # The original fell through and re-parsed the previous page's HTML
        # (or hit a NameError on the very first URL); `continue` fixes that.
        print("\r\nexcept:" + page)
        continue
    soup = BeautifulSoup(htmltext)
    sys.stdout.write('.')  # progress tick: one dot per fetched page

    # Anchors: resolve relative hrefs and enqueue unseen same-domain links.
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(url, tag['href'])
        if hostname in tag['href'] and tag['href'] not in visited:
            urls.append(tag['href'])
            visited.append(tag['href'])

    # Images: collect same-domain image sources.
    for tag in soup.findAll('img', src=True):
        tag['img'] = urlparse.urljoin(url, tag['src'])
        if hostname in tag['img']:
            imgs.append(tag['img'])

    # Forms: collect same-domain form actions.
    for tag in soup.findAll('form', action=True):
        tag['form'] = urlparse.urljoin(url, tag['action'])
        if hostname in tag['form']:
            forms.append(tag['form'])

# Deduplicate once after the crawl instead of rebuilding a set on every
# page (the original ran list(set(...)) inside the loop); the final values
# are the same since set() order is arbitrary either way.
imgs = list(set(imgs))
forms = list(set(forms))
# Summarize the crawl: elapsed time plus every collected URL, grouped by
# category.  Output is identical to printing each list in its own loop.
tend = datetime.now()
tperiod = tend - tstart

sections = (
    ("\r\n[URL]", visited),
    ("\r\n[IMGS]", imgs),
    ("\r\n[Forms]", forms),
)
for header, entries in sections:
    print(header)
    for entry in entries:
        print(entry)

print("\r\nTime - " + str(tperiod))
No comments:
Post a Comment