view scrape-gm.py @ 15:789cf10ce4c9

Update for new format (for sure)
author darius@Inchoate
date Sun, 14 Dec 2008 18:55:39 +1030
parents 5058c2695109
children eeee17d2072c
line wrap: on
line source

#!/usr/bin/env python

############################################################################
# Screen scraper for game-monitor.com
#
# Prints out matched player names aggregated by server
#
############################################################################
#
# Copyright (C) 2008 Daniel O'Connor. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
############################################################################

import re, time, datetime, urllib, sys, BeautifulSoup

# Diagnostic switch: when true, Server.__init__ prints the column lists it
# was given and Server.Scrape reports rows it skips for having the wrong
# number of cells.
debug = False

class Server:
    """A game server listed in a game-monitor.com player search result.

    Instances are built from one pair of already-cleaned table rows (see
    Scrape) and accumulate the names of the matched players found on that
    server via addplayer().
    """

    # Any tag at all; used to drop whatever markup is left after cleaning.
    alltags = re.compile(r'<[^>]*>')
    # Tags that are replaced by a newline.
    vwhttags = re.compile(r'<(br|hr)>')
    # Non-breaking-space entities, removed outright.
    hwhttags = re.compile(r'&nbsp;')
    # A cell holding a /GameSearch/<type>/ link; group 1 is the game type.
    typetag = re.compile(r'<td><a href="/GameSearch/([^/]+)/.*</td>')

    # Field parsers, compiled once for the class instead of once per instance.
    # "<players>/<max players>"
    pcountre = re.compile(r"([0-9]+)/([0-9]+)")
    # "<dotted quad>:<port>"
    ipportre = re.compile(r"([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+):([0-9]+)")
    # "<spaces><number>. <description>"
    sdesc = re.compile(r" +[0-9]+\. +(.*)")

    def __init__(self, pcols, scols):
        """Build a server record from one cleaned player row and server row.

        pcols -- cells of the player-table row:
                 pcols[1] = Player name
                 pcols[2] = Server description
        scols -- cells of the matching server-table row:
                 scols[0] = Players in server / max players
                 scols[1] = Server IP & port
                 scols[2] = Map name
                 scols[3] = Game type
                 scols[8] = Update age

        Raises SyntaxError if the description, the address or the player
        count does not have the expected form.
        """
        if debug:
            print("pcols = " + str(pcols))
            print("scols = " + str(scols))

        m = self.sdesc.match(pcols[2])
        if m is None:
            raise SyntaxError("unrecognised server description: %r" % (pcols[2],))
        self.description = m.group(1)

        m = self.ipportre.match(scols[1])
        if m is None:
            raise SyntaxError("unrecognised server address: %r" % (scols[1],))
        self.ip = m.group(1)
        self.port = int(m.group(2))

        self.gametype = scols[3]
        self.mapname = scols[2]
        self.updateage = scols[8]

        m = self.pcountre.match(scols[0])
        if m is None:
            raise SyntaxError("unrecognised player count: %r" % (scols[0],))
        self.numplayers = int(m.group(1))
        self.maxplayers = int(m.group(2))

        # Matched player names; filled in by addplayer().
        self.players = []

    def __str__(self):
        """Return a one-line summary of the server and its matched players."""
        # Each name is preceded by a space, so the list starts with one too.
        plist = "".join([" " + str(p) for p in self.players])
        return "%s: %s (%s:%d) | Map: %s | Players: %d/%d : %s (%s old)" % (
            self.gametype, self.description, self.ip, self.port,
            self.mapname, self.numplayers, self.maxplayers, plist,
            self.updateage)

    @staticmethod
    def GetTuple(scols):
        """Return the key under which a server row is aggregated.

        The key is the server's "IP:port" cell (scols[1]).  Map name and
        game type are not used because many distinct servers share both,
        which would merge their player lists into a single entry.
        """
        return str(scols[1])

    @staticmethod
    def FixTags(s):
        """Reduce the markup of one table cell to plain text.

        A cell that matches the game-type link pattern is replaced by the
        type taken from the link.  After that <br>/<hr> become newlines,
        &nbsp; entities are removed, XML entities are decoded and any
        remaining tags are stripped.
        """
        # Mangle game type
        t = Server.typetag.match(s)
        if t is not None:
            s = t.group(1)
        s = Server.vwhttags.sub('\n', s)
        s = Server.hwhttags.sub('', s)
        s = str(BeautifulSoup.BeautifulStoneSoup(
            s, convertEntities=BeautifulSoup.BeautifulStoneSoup.XML_ENTITIES))
        s = Server.alltags.sub('', s)
        return s

    @staticmethod
    def Scrape(handle):
        """Parse a search-result page into a dict of Server objects.

        handle -- markup (or a file-like object) accepted by BeautifulSoup.

        Returns a dict mapping GetTuple() keys to Server instances, each
        holding the matched players found on it.  Returns None, after
        printing a message, when the results table or the table following
        it cannot be found, or when the two tables differ in row count.
        """
        soup = BeautifulSoup.BeautifulSoup(handle)

        playertbl = soup.find("table", "results")
        if playertbl is None:
            print("Unable to find results")
            return None

        servertbl = playertbl.findNext("table")
        if servertbl is None:
            print("Unable to find server table")
            return None

        playerrows = playertbl.findAll("tr")
        serverrows = servertbl.findAll("tr")

        # Rows are paired by position, so the tables must line up exactly.
        if len(playerrows) != len(serverrows):
            print("Internal error 41223")
            return None

        servers = {}
        # The first row of each table is skipped (presumably the header
        # row); every later row, including the last, is examined.
        for prow, srow in zip(playerrows[1:], serverrows[1:]):
            pcols = prow.findAll('td')
            scols = srow.findAll('td')
            if len(pcols) != 3:
                if debug:
                    print("pcols has length %d, expected 3" % len(pcols))
                continue
            # __init__ reads up to scols[8]; skip rows that are too short.
            if len(scols) < 9:
                if debug:
                    print("scols has length %d, expected at least 9" % len(scols))
                continue

            pcols = [Server.FixTags(str(c)) for c in pcols]
            scols = [Server.FixTags(str(c)) for c in scols]

            stuple = Server.GetTuple(scols)
            if stuple not in servers:
                servers[stuple] = Server(pcols, scols)

            servers[stuple].addplayer(pcols[1])

        return servers

    def addplayer(self, pname):
        """Record pname as one of the matched players on this server."""
        self.players.append(pname)
    
    
# Command-line entry point: search game-monitor.com for the player name
# given as the first argument and print the servers the matches are on.
if __name__ == "__main__":
    # Upper bound on the number of servers printed.
    maxhits = 10
    if len(sys.argv) < 2:
        print("Bad usage")
        print(sys.argv[0] + " search_string")
        sys.exit(1)

    try:
        # For offline testing, substitute a saved page: f = open("gm.html")
        f = urllib.urlopen("http://www.game-monitor.com/search.php?location=AU&search=" + urllib.quote(sys.argv[1]) + "&type=player&location=AU")
    except IOError as e:
        print("Unable to fetch page - " + str(e))
        # NOTE(review): the exit status is 0 even though the fetch failed;
        # kept as in the original in case a caller relies on it.
        sys.exit(0)

    # Close the connection explicitly once the page has been parsed.
    try:
        servers = Server.Scrape(f)
    finally:
        f.close()

    if servers is None:
        print("No results available, please check manually")
    elif len(servers) == 0:
        print("No players found")
    else:
        # Server defines no ordering of its own, so sort on explicit fields
        # to get a stable, repeatable listing (and a repeatable cut-off).
        ordered = sorted(servers.values(),
                         key=lambda srv: (srv.gametype, srv.description))
        for shown, srv in enumerate(ordered):
            print(srv)
            if shown + 1 >= maxhits:
                print("*** Stopping after " + str(maxhits) + " hits")
                break