Thursday, 29 August 2013

Python Web Crawler Code - testing

It's just a sample.

You can build on it to make something more complete.
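
It only needs Python 2 and BeautifulSoup 4 (pip install beautifulsoup4) to run.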

#Python code.

 #page spider  
 import sys, urlparse, urllib  
 from bs4 import BeautifulSoup  
 from datetime import datetime  
   
   
 url = "http://hacktizen.blogspot.com/"  
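 # Reduce the hostname to the registered domain: keep the last two labels,
 # or the last three when the second-to-last label is short (e.g. ".co.uk").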
 hostname = urlparse.urlparse(url).hostname.split(".")  
 hostname = ".".join(len(hostname[-2]) < 4 and hostname[-3:] or hostname[-2:])  
   
   
 urls = [url] # stack of URLs to scrape
 visited = [url] # historic record of URLs
 imgs = []  
 forms = []  
   
 print "Search"  
   
 tstart = datetime.now()  
 while len(urls) > 0:
   page = urls.pop(0)  # take the next URL to fetch
   try:
     htmltext = urllib.urlopen(page).read()
   except:
     print "\r\nexcept:"+page
     continue  # skip pages that fail to load
   soup = BeautifulSoup(htmltext, "html.parser")
   sys.stdout.write('.')
     
   # queue every link on the same domain that has not been seen yet
   for tag in soup.findAll('a', href=True):
     tag['href'] = urlparse.urljoin(page, tag['href'])
     if hostname in tag['href'] and tag['href'] not in visited:  
       urls.append(tag['href'])  
       visited.append(tag['href'])  
     
   # collect unique image URLs hosted on the same domain
   for tag in soup.findAll('img', src=True):
     tag['img'] = urlparse.urljoin(page, tag['src'])
     if hostname in tag['img']:  
       imgs.append(tag['img'])  
       imgs = list(set(imgs))  
   
   # collect unique form action URLs on the same domain
   for tag in soup.findAll('form', action=True):
     tag['form'] = urlparse.urljoin(page, tag['action'])
     if hostname in tag['form']:  
       forms.append(tag['form'])  
       forms = list(set(forms))  
   
   
 tend = datetime.now()  
 tperiod = tend - tstart  
 print("\r\n[URL]")  
 for links in visited:  
   print links  
 print("\r\n[IMGS]")  
 for links in imgs:  
   print links  
 print("\r\n[Forms]")  
 for links in forms:  
   print links  
 print("\r\nTime - "+str(tperiod))