view scrape-vb.py @ 1:8045db05180b SCRAPEVB_1_0

Initial revision
author darius
date Sat, 25 Aug 2007 05:17:29 +0000
parents
children 89232ea0c3d4
line wrap: on
line source

#!/usr/bin/env python

############################################################################
# Screen scraper for Virgin Blue to look for happy hour deals
#
# Prints out (and emails) when criteria match based on cost,
# destination, etc
#
# $Id: scrape-vb.py,v 1.1.1.1 2007/08/25 05:17:29 darius Exp $
############################################################################
#
# Copyright (C) 2007 Daniel O'Connor. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
############################################################################

import re, BeautifulSoup, datetime, time, smtplib, sys, urllib

#### Configuration

### Travel criteria
# One dictionary per person to notify.
# Supported keys are email, when, city1, city2, maxcost
# email is mandatory. If city2 is not present either city will be
# matched. when and maxcost are optional (will match for any date or
# cost)
# when is a date written as dd/mm/yy and must fall inside the advertised
# travel period; maxcost is the highest acceptable fare in dollars.
travellers = [
    { 'email' : 'darius@dons.net.au', 'city1' : 'Sydney' },
    { 'email' : 'sarah.mahoney@nehta.gov.au', 'city1' : 'Adelaide', 'city2' : 'Brisbane' },
  ]

### Mail host
# SMTP server the notifications are submitted to
mailhost = 'mail.dons.net.au'

### Who the email is from
mailfrom = 'darius@dons.net.au'

### What's on the subject line
mailsubj = 'Virgin Blue Happy Hour Deals'

### Actually send email?
# When False the messages are only printed, nothing is sent
mailsend = False

### URL to parse
vburl = 'http://virginblue.com.au'

# Matches a deal link title of the form "<city> - <city> $<cost>"
# (case insensitive). Groups: 1 = first city, 2 = second city, 3 = cost
parsetitle = re.compile('([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE)
# Matches "Travel Period: <date> - <date>" where each date is three
# slash-separated numbers. Groups: 1 = start date, 2 = end date
parsetper = re.compile('Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE)

try:
    #f = open("vb-happyhour.html")
    f = urllib.urlopen(vburl)
except IOError, e:
    print  "Unable to fetch page - " + str(e)
    sys.exit(1)
    
s = BeautifulSoup.BeautifulSoup(f)
hrr = s.find("ul", "happyhr-rows")
if (hrr == None):
    print "No happy hour details found"
    sys.exit(0)
    
hrlist = hrr.findAll("li")

# XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it
# doesn't work
times = parsetper.match(s.findAll('ul')[11].find('li').string)
if (times == None):
    print "Unable to parse travel period " + parsetper.match(s.findAll('ul')[11].find('li'))
    sys.exit(0)
    
frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3])
totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3])

#print "Travel from %s to %s" % (str(frtime), str(totime))

output = {}
for i in hrlist:
    href =  i.find('a')
    match = parsetitle.match(href['title'])
    if (match == None):
        print "Unable to match " + str(s)
        continue

    city1 = match.group(1)
    city2 = match.group(2)
    cost = int(match.group(3))
    url = href['href']
    
    for t in travellers:
        if ('email' not in t):
            print "No email key found, configuration error?"
            continue
        
        citymatch = True
        if ('city1' in t and 'city2' in t):
            if((t['city1'] != city1 or t['city2'] != city2) and
               (t['city1'] != city2 or t['city2'] != city1)):
                   citymatch = False
        elif ('city1' in t):
            if (t['city1'] != city1 and t['city1'] != city2):
                citymatch = False
            
        datematch = True
        if ('when' in t):
            travtime = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3])
            if (travtime < frtime or travtime > totime):
                datematch = False

        costmatch = True
        if ('maxcost' in t):
            if (cost > int(t['maxcost'])):
                costmatch = False
                
        if (citymatch and datematch and costmatch):
            if (t['email'] not in output):
                
                output[t['email']] = []
            output[t['email']].append([city1, city2, cost, url])

if (mailsend):
    server = smtplib.SMTP(mailhost)
    #server.set_debuglevel(1)

for o in output:
    msg = ("From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj))
    msg = msg + "Your criteria for flights have been matched\r\n\r\n"
    print "Sending email to " + o
    for i in output[o]:
        print "%s <-> %s costs $%d" % (i[0], i[1], i[2])
        msg = msg + "%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3])

    msg = msg + "\r\nNote: travel period is from %s to %s" % \
                 (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y"))
    if (mailsend):
        server.sendmail(mailfrom, o, msg)
    else:
        print msg
    print