Sunday 3 August 2014

How to find all IP addresses and websites with a crawler written in Python

Standard

I have tried lots of crawlers, but none of them fulfilled all of my requirements,
so I decided to create a new crawler that finds every IP address with a website, downloads the home page and stores it in my folder.
It works fine but is a little slow: at present I have 70,000 websites in my folder after 3 days. The program looks for IP addresses all over the world.

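First of all, create a folder where your program will store its results, and inside it create a sub-folder named "website" (the script saves every page as website/<ip address>). Then create a Python file with the extension .py in that folder, copy the code below into the file and save it.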



'''
This is an IP finder: it finds every IP address that has a website, all over the world.
'''
from socket import gethostbyaddr
import threading
import sys
import time
from itertools import product
import urllib2
from netaddr import IPRange
from colorama import Fore, Back, Style

# Maximum number of lookup threads allowed to run at the same time; main()
# uses it as the initial value of the semaphore that throttles thread creation.
NB_OF_THREADS_MAX = 254

def show_name(ip):
  '''Resolve ``ip`` to a host name and try to save that host's home page.

  Runs as a worker thread: ``main`` acquires one slot of the module-level
  ``semaphore`` before starting it, and this function always gives that
  slot back when it finishes, whatever happened in between.

  ip -- the IP address to look up, as a string.

  Prints a blue "not resolve" line when the reverse lookup (or anything
  else in the body) fails; ``store_html`` prints its own green/red line.
  '''
  try:
    host = gethostbyaddr(ip)[0]
    store_html(host, ip)
  except Exception:
    # Most commonly the reverse DNS lookup found no name for this address.
    print(Fore.BLUE + "%s ------ not resolve\n" % ip)
    print(Fore.RESET)
  finally:
    # Release in ``finally`` so a failure while reporting can never leak a
    # slot and eventually stall the loop in ``main``.
    semaphore.release()

def store_html(host,ip,timeout=10):
  '''Download the home page of ``host`` and save it as ``website/<ip>``.

  host    -- host name to fetch; the page requested is ``http://<host>``.
  ip      -- IP address of the host, used as the output file name.
  timeout -- seconds to wait on the network before giving up, so that one
             unresponsive server cannot block a worker thread forever.

  Returns True and prints a green "ok" line when the page was saved,
  otherwise prints a red "error" line and returns False. The ``website``
  directory must already exist.
  '''
  try:
    response = urllib2.urlopen('http://'+host, timeout=timeout)
    try:
      html = response.read()
    finally:
      # Always give the connection back, even if reading fails.
      response.close()
    # Binary mode: store the bytes exactly as the server sent them.
    with open("website/"+ip, "wb") as fh:
      fh.write(html)
    print(Fore.GREEN+"%s (%s) -------- ok\n"%(host,ip))
    print(Fore.RESET)
    return True
  except Exception:
    print(Fore.RED+"%s (%s) -------- error\n"%(host,ip))
    print(Fore.RESET)
    return False

def main ():
  '''Scan the range 1.1.1.1 - 254.254.254.254, one worker thread per address.

  Creates the module-level ``lock`` and ``semaphore`` used by the workers.
  The semaphore is acquired before each thread is started and released by
  ``show_name`` when it finishes, so at most ``NB_OF_THREADS_MAX`` lookups
  run at once. Returns after every worker thread has ended.
  '''
  global lock
  global semaphore
  lock = threading.Lock()
  semaphore = threading.BoundedSemaphore(value = NB_OF_THREADS_MAX)
  for n in IPRange('1.1.1.1','254.254.254.254'):
    ip_to_test = n.format()
    # Blocks here while NB_OF_THREADS_MAX workers are already running.
    semaphore.acquire()
    current_thread = threading.Thread(target=show_name, args=(ip_to_test,))
    try:
      current_thread.start()
    except Exception:
      # The worker never ran, so it will never release its slot; give it
      # back here or the loop would eventually block forever on acquire().
      semaphore.release()
      print("can't start so many threads.")
  # Wait until only the main thread is left.
  while threading.activeCount() != 1:
    time.sleep(2)

# Start the scan as soon as this file is executed.
main()


Now open a command prompt and type this command:

$python file_name.py

You will see output like the screenshot below:



Green means the IP address resolved and its website exists, red means it resolved but the website could not be fetched, and blue means the IP address did not resolve.

If you want more threads, increase the value of NB_OF_THREADS_MAX; it sets how many IP addresses are looked up in parallel.

Let's celebrate, you have discovered your best cocktail.

enjoy