view scrape-vb.py @ 3:89232ea0c3d4

Read configuration from an ini file rather than hard coding it in the source.
author darius
date Mon, 27 Aug 2007 01:42:11 +0000
parents 8045db05180b
children e3f4ef0b6e39
line wrap: on
line source

#!/usr/bin/env python

############################################################################
# Screen scraper for Virgin Blue to look for happy hour deals
#
# Prints out (and emails) when criteria match based on cost,
# destination, etc
#
# $Id: scrape-vb.py,v 1.2 2007/08/27 01:42:11 darius Exp $
############################################################################
#
# Copyright (C) 2007 Daniel O'Connor. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
############################################################################

import re, BeautifulSoup, datetime, time, smtplib, sys, urllib, ConfigParser

# Deal link titles look like "<city> - <city> $<cost>"; the groups are the two
# city names and the whole-dollar fare.  Raw strings keep the "\$" escape
# intact for the regex engine instead of relying on Python passing an
# unrecognised string escape through unchanged.
parsetitle = re.compile(r'([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.IGNORECASE)
# The conditions text carries "Travel Period: <from> - <to>"; the groups are
# the two slash-separated dates.
parsetper = re.compile(r'Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)', re.IGNORECASE)

# Configuration: seed the [global] section with built-in defaults, then
# layer scrape-vb.ini (looked up in the current directory) on top so the
# file can override them and add the per-subscriber sections.
conf = ConfigParser.ConfigParser()
conf.add_section('global')
for option, value in (('mailsubj', 'Virgin Blue Happy Hour Deals'),
                      ('vburl', 'http://virginblue.com.au')):
    conf.set('global', option, value)
conf.read('scrape-vb.ini')

# Fetch the page to scrape.  The URL is the 'vburl' option of the [global]
# configuration section (previously this referenced an undefined name and
# the script died with a NameError before fetching anything).
vburl = conf.get('global', 'vburl')
try:
    # For offline testing a saved copy can be substituted here, e.g.
    #   f = open("vb-happyhour.html")
    f = urllib.urlopen(vburl)
except IOError as e:
    print("Unable to fetch page - " + str(e))
    sys.exit(1)
    
# Parse the fetched page; the handle is not needed once the soup is built.
s = BeautifulSoup.BeautifulSoup(f)
f.close()

# The deals are the <li> entries of <ul class="happyhr-rows">.  When that
# list is absent the script reports it and exits with status 0.
hrr = s.find("ul", "happyhr-rows")
if hrr is None:
    print("No happy hour details found")
    sys.exit(0)

hrlist = hrr.findAll("li")

# XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it
# doesn't work, so the conditions list is picked by position instead:
# the 12th <ul> on the page, whose first <li> holds the travel period.
ullist = s.findAll('ul')
tpertext = None
if len(ullist) > 11:
    tperitem = ullist[11].find('li')
    if tperitem is not None:
        # .string is None unless the <li> contains exactly one text node.
        tpertext = tperitem.string

times = None
if tpertext is not None:
    times = parsetper.match(tpertext)
if times is None:
    # Show what was actually found (repr copes with None and non-ASCII
    # text) rather than failing while building the message.
    print("Unable to parse travel period " + repr(tpertext))
    sys.exit(0)

# Travel window as datetimes at midnight; dates are day/month/2-digit-year.
frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3])
totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3])

# Collect each subscriber's criteria once, up front.  Every configuration
# section other than [global] is one subscriber; the section name is the
# address to notify (an "email" option inside the section overrides it).
# Recognised options: city1, city2, when (dd/mm/yy) and maxcost.
subscribers = []
for email in conf.sections():
    if email == 'global':
        continue

    t = {'email': email}
    t.update(conf.items(email))
    # Convert the typed options here so a malformed value raises its
    # ValueError before any deal is examined.
    if 'when' in t:
        t['when'] = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3])
    if 'maxcost' in t:
        t['maxcost'] = int(t['maxcost'])
    subscribers.append(t)

# Maps address -> list of [city1, city2, cost, url], one entry per deal
# that satisfies all of that subscriber's criteria.
output = {}
for deal in hrlist:
    # Each deal is a link whose title reads "<city> - <city> $<cost>".
    href = deal.find('a')
    title = None
    if href is not None:
        title = href.get('title')
    match = None
    if title is not None:
        match = parsetitle.match(title)
    if match is None:
        # Report the offending title (not the whole page) and move on.
        print("Unable to match " + repr(title))
        continue

    city1 = match.group(1)
    city2 = match.group(2)
    cost = int(match.group(3))
    url = href['href']

    for t in subscribers:
        # Route filter: with both cities given the deal must join exactly
        # those two, in either direction; with only city1 given it must be
        # either end of the route.  A lone city2 is not used as a filter.
        if 'city1' in t and 'city2' in t:
            citymatch = (t['city1'], t['city2']) in ((city1, city2),
                                                     (city2, city1))
        elif 'city1' in t:
            citymatch = t['city1'] in (city1, city2)
        else:
            citymatch = True

        # The requested date must fall inside the advertised travel period
        # (both ends inclusive).
        datematch = 'when' not in t or frtime <= t['when'] <= totime

        costmatch = 'maxcost' not in t or cost <= t['maxcost']

        if citymatch and datematch and costmatch:
            output.setdefault(t['email'], []).append([city1, city2, cost, url])

# Mail settings come from the [global] section.  Mail is only sent when
# mailhost, mailsend and mailfrom are all present and mailsend is true;
# if any of them is missing the matches are printed instead.
try:
    mailsubj = conf.get('global', 'mailsubj')
    mailhost = conf.get('global', 'mailhost')
    mailsend = conf.getboolean('global', 'mailsend')
    mailfrom = conf.get('global', 'mailfrom')
except ConfigParser.NoOptionError:
    mailsend = False

server = None
if mailsend:
    server = smtplib.SMTP(mailhost)
    #server.set_debuglevel(1)
else:
    print("Note: Mail sending disabled")

# The travel period is the same for every recipient, so format it once.
ttimestr = "Note: travel period is from %s to %s" % \
           (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y"))

# 'output' maps each address to its list of [city1, city2, cost, url].
try:
    for o in output:
        if mailsend:
            # One message per recipient: headers, blank line, then a line
            # per matching deal (including its URL) and the travel period.
            parts = ["From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj),
                     "Your criteria for flights have been matched\r\n\r\n"]
            for i in output[o]:
                parts.append("%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3]))
            parts.append("\r\n" + ttimestr + "\r\n")
            server.sendmail(mailfrom, o, "".join(parts))
        else:
            print("Match for " + o)
            for i in output[o]:
                print("%s <-> %s costs $%d" % (i[0], i[1], i[2]))
            print(ttimestr)
            print("")
finally:
    # End the SMTP session even if a send fails part-way through.
    if server is not None:
        try:
            server.quit()
        except smtplib.SMTPServerDisconnected:
            # The connection is already gone; there is nothing to close.
            pass