view scrape-vb.py @ 9:3e03facad74b default tip

New example files for latest layout.
author darius
date Thu, 18 Oct 2007 06:58:00 +0000
parents d17fd6f3a492
children
line wrap: on
line source

#!/usr/bin/env python

############################################################################
# Screen scraper for Virgin Blue to look for happy hour deals
#
# Prints out (and emails) when criteria match based on cost,
# destination, etc
#
# $Id: scrape-vb.py,v 1.7 2007/10/18 06:57:35 darius Exp $
############################################################################
#
# Copyright (C) 2007 Daniel O'Connor. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
############################################################################

import os, re, BeautifulSoup, datetime, time, smtplib, sys, urllib
import ConfigParser, optparse, SMSVodaAu

# Usage text shown by -h/--help; optparse replaces %prog with the script name
usage = '''%prog [options]
Reads configuration from ./scrape-vb.ini and ~/.scrape-vb.ini'''

# Keep the parser under its own name so the imported optparse module is
# not rebound to an OptionParser instance
parser = optparse.OptionParser(usage, version="$Id: scrape-vb.py,v 1.7 2007/10/18 06:57:35 darius Exp $")
parser.add_option('-d', '--debug', action="store_true", default=False,
                  help="Disable mail & SMS sending, prints message to stdout")
parser.add_option('-f', '--file', help="Do not fetch the page, use this file instead")
parser.add_option('-e', '--example', action="store_true", default=False,
                  help="Print an example configuration file to stdout and exit")
# options carries the parsed flags; args holds any leftover positional
# arguments
(options, args) = parser.parse_args()

if (options.example):
    print '''[global]
mailsubj="Subject line for emails"
# The following 3 options are necessary before email will be sent
mailfrom=user@host.com
mailsend=True
mailhost=mail.server.com
smsuser=0412312312
smspass=mys3krit
smssend=True

[user@host.com]
# All fields are optional
city1=Foo
city2=Bar
when=dd/mm/yy
maxcost=123
phone=0498765432
'''
    sys.exit(0)
    
# Pattern that pulls the start and end dates out of the travel period text
parsetper = re.compile('Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE)

# Seed the [global] section with built-in defaults before the ini files
# are read, so these two options always have a value
conf = ConfigParser.ConfigParser()
conf.add_section('global')
for opt, val in (('mailsubj', 'Virgin Blue Happy Hour Deals'),
                 ('vburl', 'http://virginblue.com.au')):
    conf.set('global', opt, val)

# Always look in the current directory and, when HOME is set, in the
# user's home directory as well
conflist = ['scrape-vb.ini']
if 'HOME' in os.environ:
    conflist = conflist + [os.path.expanduser('~/.scrape-vb.ini')]
conf.read(conflist)

try:
    if (options.file != None):
        f = open(options.file)
    else:
        f = urllib.urlopen(conf.get('global', 'vburl'))
except IOError, e:
    print  "Unable to fetch page - " + str(e)
    sys.exit(1)

# Test if we have been configured to send SMSs
try:
    smsuser = conf.get('global', 'smsuser')
    smspass = conf.get('global', 'smspass')
    smssend = conf.getboolean('global', 'smssend')
except ConfigParser.NoOptionError:
    smssend = False

if (options.debug == True and smssend):
    print "smssend overridden due to debugging"
    smssend = False

if (smssend):
    smshndl = SMSVodaAu.SMSVodaAu(smsuser, smspass)

s = BeautifulSoup.BeautifulSoup(f)
citypairs = s.findAll("td", "city-pair")
if (citypairs == []):
    print "No happy hour details found"
    sys.exit(0)

prices = s.findAll("td", "dash-r price")
if (prices == []):
    print "Couldn't find prices"
    sys.exit(0)

if (len(citypairs) != len(prices)):
    print "City pair & price tables don't have equal size"
    sys.exit(0)

times = parsetper.search(s.find('p', 'tandc').string)
if (times == None):
    print "Unable to parse travel period " + parsetper.match(s.findAll('ul')[11].find('li'))
    sys.exit(0)
    
frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3])
totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3])

#
# Go through the HTML and work out who wants to be notified of what
#
# Store in output, a dictionary keyed by email adddress which holds a
# list of each matching flight (city1, city2, cost, url)
#
output = {}
for i, p in zip(citypairs, prices):
    href =  i.find('a')

    city1 = href.next.strip()
    city2 = href.next.next.next.next.next.strip()
    cost = int(p.find('a').string.strip('$^ '))
    url = href['href']
    
    for email in conf.sections():
        if (email == 'global'):
            continue
        # Stuff configuration into a dictionary for our convenience
        t = {'email' : email}
        for i in conf.items(email):
            t[i[0]] = i[1]
            
        citymatch = True
        if ('city1' in t and 'city2' in t):
            if((t['city1'] != city1 or t['city2'] != city2) and
               (t['city1'] != city2 or t['city2'] != city1)):
                   citymatch = False
        elif ('city1' in t):
            if (t['city1'] != city1 and t['city1'] != city2):
                citymatch = False
            
        datematch = True
        if ('when' in t):
            travtime = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3])
            if (travtime < frtime or travtime > totime):
                datematch = False

        costmatch = True
        if ('maxcost' in t):
            if (cost > int(t['maxcost'])):
                costmatch = False
                
        if (citymatch and datematch and costmatch):
            if (t['email'] not in output):
                
                output[t['email']] = []
            output[t['email']].append([city1, city2, cost, url])

# Test if we have been configured to send email
try:
    mailsubj = conf.get('global', 'mailsubj')
    mailhost = conf.get('global', 'mailhost')
    mailsend = conf.getboolean('global', 'mailsend')
    mailfrom = conf.get('global', 'mailfrom')
except ConfigParser.NoOptionError:
    mailsend = False

if (options.debug == True and mailsend):
    print "mailsend overridden due to debugging"
    mailsend = False
    
if (mailsend):
    server = smtplib.SMTP(mailhost)
    #server.set_debuglevel(1)

#
# Output the various notifications
#
ttimestr = "Note: travel period is from %s to %s" % \
           (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y"))

# Email each person about their flights    
if (mailsend):
    for o in output:
        msg = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj)
        msg = msg + "Your criteria for flights have been matched\r\n\r\n"
        for i in output[o]:
            msg = msg + "%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3])

        msg = msg + "\r\n" + ttimestr + "\r\n"
        server.sendmail(mailfrom, o, msg)

else:
    # If not emailing print to stdout
    for o in output:
        print "Match for " + o
        for i in output[o]:
            print "%s <-> %s costs $%d" % (i[0], i[1], i[2])

# SMS each person about their flights
if (smssend):
    for o in output:
        if (conf.has_option(o, 'phone')):
            pnum = conf.get(o, 'phone')
            msg = ""
            for i in output[o]:
                msg = msg + "%s <-> %s $%d, " % (i[0], i[1], i[2])
            # Chop off the last , & make sure the whole message is not
            # too large.
            msgend = min(len(msg) - 2, 160)
            print msg[0:msgend]
            try:
                smshndl.sendamsg(pnum, msg[0:msgend])
                print "Sent SMS to " + pnum
            except:
                print "Unable to send SMS to " + pnum