Picrab.py: grab images from websites


Want some code to grab all the pictures on a given website?
Here it is: just give it the website URL, a picture size limit and the depth to search.
Job done….
When I tested Picrab, it collected more than 2500 pics from a website in 15 mins!
There is no threading, so the HTTP requests are made one at a time, sequentially (see the thread-pool sketch after the script).
usage : ./picrab.py <URL> <recursion_depth> <img_size_limit> [url_key]
example : ./picrab.py http://example.com/pictures 9 10000 example
(optional url_key : if given, only URLs that contain this key are fetched)

#!/usr/bin/python
#
#Author : Narendra L
#Date : 27th oct 2012
#licence : open source
#file: picrab.py

import urllib, re, sys, time, os, urllib2
from BeautifulSoup import BeautifulSoup

g_img_lst = []
g_href_lst = []
dirname = "picrab"

def getpagedata(url):

    """Return the raw page content for the given URL."""

    response = urllib2.urlopen(url=url, timeout=6)
    data = response.read()
    return data


def filtrimgorurl(g_urlorimg, fltr_urls, key=""):

    """Return the entries of fltr_urls that are not already in g_urlorimg,
    keeping only those that contain key when a key is given."""

    g_urlorimg = list(set(g_urlorimg))
    fltr_urls = list(set(fltr_urls))
    if not key:
        ret_urls = filter(lambda x: x not in g_urlorimg, fltr_urls)
    else:
        ret_urls = filter(lambda x: x not in g_urlorimg and key in x, fltr_urls)

    return ret_urls
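
# A quick illustration of what filtrimgorurl returns (hypothetical values, not part of
# the original script): entries already in the first list are dropped, and when a key
# is given only entries that contain it survive.
#
#   filtrimgorurl(['a.jpg'], ['a.jpg', 'b.jpg'])                      -> ['b.jpg']
#   filtrimgorurl([], ['http://x/cats/1', 'http://x/dogs/2'], 'cats') -> ['http://x/cats/1']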


def tagdatafetcher(page, htmltag, attr):

    """Fetch the given attribute (e.g. img src or a href) from every matching tag."""

    soup = BeautifulSoup(page)
    imgsrc_list = soup.findAll(htmltag)
    ret_data = []
    print "updating " + htmltag + " " + attr + " Links"
    for img in imgsrc_list:
        try:
            ret_data.append(img[attr])
        except:
            print "wrong HTML tag or missing attribute"

    return ret_data


def checkdir(dirname):

    """Create the download directory if it does not exist yet."""

    if not os.path.isdir(dirname):
        os.makedirs(dirname)


key = ""
try:
    url_init = sys.argv[1]
    if url_init.find("http://") == -1:
        url_init = "http://" + url_init
    depth = int(sys.argv[2])
    imgsize = int(sys.argv[3])
except:
    print "\nhelp : ./picrab.py url_to_fetch fetch_depth img_size_limit [url_key]\n"
    print "help : size_limit can be 0 or more; keep it around 10000 for images bigger than passport size"
    sys.exit(1)

# The url_key is optional; when it is missing, every link is followed.
if len(sys.argv) > 4:
    key = sys.argv[4]

web_links = [url_init]
extn = ['png', 'gif', 'jpg', 'jpeg']
dirname = dirname + str(time.time())
checkdir(dirname)

for frst in range(0, depth):
    next_links = []
    for url in web_links:
        try:
            print "Fetching WEBPAGE :", url
            data = getpagedata(url)
            print "Web ok....."
            img_links = tagdatafetcher(data, "img", "src")
            page_links = tagdatafetcher(data, "a", "href")
            filterd_imglnks = filtrimgorurl(g_img_lst, img_links)
            filterd_urllnks = filtrimgorurl(g_href_lst, page_links, key)
            # Links that survive the filter become the crawl frontier for the next depth level.
            next_links.extend(filterd_urllnks)
            for img in filterd_imglnks:
                print "Fetching image = ", img
                data = getpagedata(img)
                print "Img ok....."
                if len(data) > imgsize:
                    # Build a flat file name from the image URL.
                    img_name = img.replace("http://", "").replace("/", "_").replace(" ", "").lower()
                    img_name = dirname + "/" + img_name
                    fileextn = img_name[-4:].replace(".", "")
                    if fileextn not in extn:
                        img_name = img_name + ".jpg"
                    try:
                        img_f = open(img_name, 'wb')
                        img_f.write(data)
                        img_f.close()
                        print "Saving Image :" + img_name
                    except:
                        checkdir(dirname)
                else:
                    print "Not saving small image, size : ", len(data)
            g_img_lst.extend(img_links)
            g_href_lst.extend(page_links)
        except KeyboardInterrupt:
            print "The End of picrab :)"
            sys.exit(1)
        except Exception, e:
            print "Something really wrong", e
    web_links = next_links
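
As noted above, picrab fetches everything sequentially. Below is a minimal sketch of how the image downloads could run in parallel with a small thread pool; it keeps the same urllib2-based fetching, but the names download_worker and fetch_images, the worker count and the queue-based layout are illustrative additions, not part of picrab.py.

import threading
import urllib2
from Queue import Queue

def download_worker(q):

    """Pull image URLs off the queue and fetch them, forever (daemon thread)."""

    while True:
        img_url = q.get()
        try:
            data = urllib2.urlopen(img_url, timeout=6).read()
            print "Fetched", img_url, len(data), "bytes"
            # ... `data` would be written to a file here, as picrab.py does ...
        except Exception, e:
            print "Failed to fetch", img_url, e
        q.task_done()

def fetch_images(img_urls, workers=8):

    """Download img_urls concurrently using `workers` daemon threads."""

    q = Queue()
    for _ in range(workers):
        t = threading.Thread(target=download_worker, args=(q,))
        t.daemon = True
        t.start()
    for url in img_urls:
        q.put(url)
    q.join()    # block until every queued URL has been handled

Each worker blocks on q.get(), so the main thread only fills the queue and waits on q.join(); the downloads overlap instead of running one after another.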