view scrape-vb.py @ 6:9f3eb9a07966

Add config parser. Add ability to print out an example config file. Add -f option to read a saved page.
author darius
date Wed, 29 Aug 2007 07:37:59 +0000
parents 275603a8e2ae
children bf896507faa9
line wrap: on
line source

#!/usr/bin/env python

############################################################################
# Screen scraper for Virgin Blue to look for happy hour deals
#
# Prints out (and emails) when criteria match based on cost,
# destination, etc
#
# $Id: scrape-vb.py,v 1.5 2007/08/29 07:37:59 darius Exp $
############################################################################
#
# Copyright (C) 2007 Daniel O'Connor. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
############################################################################

import os, re, BeautifulSoup, datetime, time, smtplib, sys, urllib
import ConfigParser, optparse

usage = '''%prog [options]
Reads configuration from ./scrape-vb.ini and ~/.scrape-vb.ini'''

# Command line handling.  The parser object gets a name of its own so
# that it does not shadow the imported optparse module.
parser = optparse.OptionParser(usage, version="$Id: scrape-vb.py,v 1.5 2007/08/29 07:37:59 darius Exp $")
parser.add_option('-d', '--debug', action="store_true", default=False,
                  help="Disable mail sending, prints mail message to stdout")
parser.add_option('-f', '--file', help="Do not fetch the page, use this file instead")
parser.add_option('-e', '--example', action="store_true", default=False,
                  help="Print an example configuration file to stdout and exit")
# options carries debug, file and example; positional arguments are unused
(options, args) = parser.parse_args()

if (options.example):
    print '''[global]
mailsubj="Subject line for emails"
# The following 3 options are necessary before email will be sent
mailfrom=user@host.com
mailsend=True
mailhost=mail.server.com

[user@host.com]
# All fields are optional
city1=Foo
city2=Bar
when=dd/mm/yy
maxcost=123
'''
    sys.exit(0)
    
# Deal titles have the form "<city> - <city> $<cost>"
parsetitle = re.compile(r'([a-z ]+) - ([a-z ]+) \$([0-9]+)', re.I)
# The travel period is given as two slash-separated dates
parsetper = re.compile(
    r'Travel Period: ([0-9]+/[0-9]+/[0-9]+) - ([0-9]+/[0-9]+/[0-9]+)',
    re.I)

# Start from built-in defaults in [global]; values found in the
# configuration files read below replace them.
conf = ConfigParser.ConfigParser()
conf.add_section('global')
for (option, value) in (('mailsubj', 'Virgin Blue Happy Hour Deals'),
                        ('vburl', 'http://virginblue.com.au')):
    conf.set('global', option, value)

# ./scrape-vb.ini is read first, then the per-user file when HOME is set
conflist = ['scrape-vb.ini']
if 'HOME' in os.environ:
    conflist += [os.path.expanduser('~/.scrape-vb.ini')]
conf.read(conflist)

try:
    if (options.file != None):
        f = open(options.file)
    else:
        f = urllib.urlopen(conf.get('global', 'vburl'))
except IOError, e:
    print  "Unable to fetch page - " + str(e)
    sys.exit(1)
    
s = BeautifulSoup.BeautifulSoup(f)
hrr = s.find("ul", "happyhr-rows")
if (hrr == None):
    print "No happy hour details found"
    sys.exit(0)
    
hrlist = hrr.findAll("li")

# XXX: I wanted to use findAll('ul', 'happyhr-conditions') but it
# doesn't work
times = parsetper.match(s.findAll('ul')[11].find('li').string)
if (times == None):
    print "Unable to parse travel period " + parsetper.match(s.findAll('ul')[11].find('li'))
    sys.exit(0)
    
frtime = datetime.datetime(*time.strptime(times.group(1), "%d/%m/%y")[0:3])
totime = datetime.datetime(*time.strptime(times.group(2), "%d/%m/%y")[0:3])

output = {}
for i in hrlist:
    href =  i.find('a')
    match = parsetitle.match(href['title'])
    if (match == None):
        print "Unable to match " + str(s)
        continue

    city1 = match.group(1)
    city2 = match.group(2)
    cost = int(match.group(3))
    url = href['href']
    
    for email in conf.sections():
        if (email == 'global'):
            continue
        # Stuff configuration into a dictionary for our convenience
        t = {'email' : email}
        for i in conf.items(email):
            t[i[0]] = i[1]
            
        citymatch = True
        if ('city1' in t and 'city2' in t):
            if((t['city1'] != city1 or t['city2'] != city2) and
               (t['city1'] != city2 or t['city2'] != city1)):
                   citymatch = False
        elif ('city1' in t):
            if (t['city1'] != city1 and t['city1'] != city2):
                citymatch = False
            
        datematch = True
        if ('when' in t):
            travtime = datetime.datetime(*time.strptime(t['when'], "%d/%m/%y")[0:3])
            if (travtime < frtime or travtime > totime):
                datematch = False

        costmatch = True
        if ('maxcost' in t):
            if (cost > int(t['maxcost'])):
                costmatch = False
                
        if (citymatch and datematch and costmatch):
            if (t['email'] not in output):
                
                output[t['email']] = []
            output[t['email']].append([city1, city2, cost, url])

try:
    mailsubj = conf.get('global', 'mailsubj')
    mailhost = conf.get('global', 'mailhost')
    mailsend = conf.getboolean('global', 'mailsend')
    mailfrom = conf.get('global', 'mailfrom')
except ConfigParser.NoOptionError:
    mailsend = False

if (options.debug == True and mailsend):
    print "mailsend overridden due to debugging"
    mailsend = False
    
if (mailsend):
    server = smtplib.SMTP(mailhost)
    #server.set_debuglevel(1)
else:
    print "Note: Mail sending disabled"
    
for o in output:
    if (mailsend):
        msg = ("From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (mailfrom, o, mailsubj))
        msg = msg + "Your criteria for flights have been matched\r\n\r\n"
    else:
        print "Match for " + o
    for i in output[o]:
        if (mailsend):
            msg = msg + "%s <-> %s costs $%d - %s\r\n" % (i[0], i[1], i[2], i[3])
        else:
            print "%s <-> %s costs $%d" % (i[0], i[1], i[2])

    ttimestr = "Note: travel period is from %s to %s" % \
               (frtime.strftime("%A %e %B %Y"), totime.strftime("%A %e %B %Y"))

    if (mailsend):
        msg = msg + "\r\n" + ttimestr + "\r\n"
        server.sendmail(mailfrom, o, msg)
    else:
        print ttimestr
        print