import Queue
import random

#Simple Crawlers

class Page:
   def __init__(self,name):
       self.links = []
       self.name = name

 
   def addLink(self,page):
       self.links.append(page)

   def crawl(self):
       print 'Crawling page',self.name

   def isupdated(self):
       return random.uniform(0,1)<self.updateprob

page_a = Page('a')
page_b = Page('b')
page_c = Page('c')
page_d = Page('d')
page_e = Page('e')
page_f = Page('f')
page_g = Page('g')
page_h = Page('h')

page_a.addLink(page_b)
page_a.addLink(page_c)

#Creating links from page B
page_b.addLink(page_d)
page_b.addLink(page_e)
page_b.addLink(page_f)

#Creating links from page C
page_c.addLink(page_g)
page_c.addLink(page_h)


print 'Width First Crawl'
#Width First Crawl - Assumes only page A is known
notyetcrawled = Queue.Queue()
notyetcrawled.put(page_a)

#Order of the following should be A, B,C, D,E,F,G,H
while not notyetcrawled.empty():
    page = notyetcrawled.get()
    page.crawl()
    for link in page.links:
        notyetcrawled.put(link)

print '\n\nDepth First Crawl'
#Depth First Crawl - Assumes only page A is known
#Order of the following should be: A,B,D,E,F,C,G,H
def crawlpage(page):
    page.crawl()
    for link in page.links:
        crawlpage(link)
crawlpage(page_a)

#Path ascending crawler - Not implemented

#Random walk
print '\n\nRandom Walk'
def crawlpage(page):
    page.crawl()
    if page.links:
       link = random.sample(page.links,1)[0]
       crawlpage(link)
crawlpage(page_a)

#Focused crawl
page_a.keywords = ['car','boat']
page_b.keywords = ['cat','dog']
page_c.keywords = ['car','mazda','toyota']
page_d.keywords = ['farm animals','horse','cat','dog']
page_e.keywords = ['news','electoric']
page_f.keywords = ['car','volvo','bmw']
page_g.keywords = ['mobile phone','computer']
page_h.keywords = ['open source','linux','firefox']


#Note that the crawler assumes that the extracted keywords are known prior to the crawl.
print '\n\nFocused crawl'
def crawlpage(page,keyword):
    if keyword in page.keywords:
        page.crawl()
    for link in page.links:
       crawlpage(link,keyword)
    
print 'Keyword = Car'
crawlpage(page_a,'car')

print 'Keyword = Cat'
crawlpage(page_a,'cat')


#Incremental web crawlers
print '\n\nIncremental crawles'
#Note that all pages are assumes known before the crawl. Note also that update probabilities of pages are only in very rare cases fixed and deterministic.
page_a.updateprob = 0.1
page_b.updateprob = 0.2
page_c.updateprob = 0.3
page_d.updateprob = 0.4
page_e.updateprob = 0.5
page_f.updateprob = 0.6
page_g.updateprob = 0.7
page_h.updateprob = 1.0


allpages = [page_a,page_b,page_c,page_d,page_e,page_f,page_g,page_h]

c = 3 #Capacity
d = 1000 #Duration e.g. 100 days
numpages = len(allpages)

#Round robin

print '\n\nRound Robin'

numtruedownloads = 0
numvisits = 0

for i in range(c*d):
   page = allpages[i%numpages]
   numtruedownloads += int(page.isupdated())
   numvisits += 1

print 'Number of true downloads:',numtruedownloads,float(numtruedownloads)/(c*d)*100,'%.'
print 'Average capacity used:',(numvisits/float(d)/c)*100,'%' 

#Proportional
print '\n\nProportional'
#Note that proportional needs to estimate the update probabilities. This is almost imposible a real web environment and does not work if pages change their update probability.
allupdateprob = sum([page.updateprob for page in allpages])
#All probabilities must sum to the capacity
adjustment = c/allupdateprob
for page in allpages:
    page.visitprob = page.updateprob*adjustment

#Making sure the sum of the visiting probabilities is equal to the capacity
if sum([page.visitprob for page in allpages])==float(c):
   print 'Adjustment successful'
else:
   print 'Error in adjustments'

numtruedownloads = 0
numvisits = 0
for time in range(d):
   for page in allpages:
       if random.uniform(0,1)<page.visitprob:
           numtruedownloads += int(page.isupdated())
           numvisits += 1

print 'Number of true downloads:',numtruedownloads,float(numtruedownloads)/(c*d)*100,'%'
print 'Average capacity used:',(numvisits/float(d)/c)*100,'%' 
print 'Percentage of true downloads adjusted for capacity exceed/not exceed:',((float(numtruedownloads)/(c*d))/(numvisits/float(d)/c)) *100,'%'


#LA-Game
print '\n\nLA-game'
#Note no prior knowledge is assumes by this crawler. This is in contrast to proportional
numstates = 10
initialstate = 5

for page in allpages:
   page.state = initialstate

numtruedownloads = 0
numvisits = 0

for time in range(d):
    punishpages = []
    rewardpages = []
    c_used = 0#Actual capacity used
    for page in allpages:
        if random.uniform(0,1)<float(page.state)/numstates:
            numvisits += 1
            #Crawling
            updated = page.isupdated()
            if updated:
               numtruedownloads += 1
               rewardpages.append(page)
            if not updated:
               punishpages.append(page)
            c_used += 1
    if c_used > c: #Capacity is exceeded
       for page in punishpages:
           page.state -= 1
           if page.state<1:
               page.state = 1
    else:
       for page in rewardpages:
               page.state += 1
               if page.state>numstates:
                  page.state = numstates
           
print 'Number of true downloads:',numtruedownloads,float(numtruedownloads)/(c*d)*100,'%'
print 'Average capacity used:',(numvisits/float(d)/c)*100,'%' 
print 'Percentage of true downloads adjusted for capacity exceed/not exceed:',((float(numtruedownloads)/(c*d))/(numvisits/float(d)/c)) *100,'%'

