#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Parse the French Sénat and Assemblée Nationale (FR) web sites to retrieve who voted what.

Parse online HTML pages, extract who voted what, identify the voters (email),
sort them and build HTML pages ready to be uploaded.

Requires
- BeautifulSoup 3
    XML/SGML parser (used here as HTML parser)
    Copyright (c) 2004-2008, Leonard Richardson
    New-BSD licence
    http://www.crummy.com/software/BeautifulSoup/
- UrlLib2
    processor to open URLs (presumably the Sénat / Assemblée Nationale pages
    configured below; the download code itself is not part of the visible text)
    Python module
- sys
    recover exception error and message
    Python module

Pierre-Alain Dorange, october 2010
New-BSD Licence
"""

# NOTE(review): this file reached review whitespace-collapsed and partially
# stripped (see the NOTE(review) markers below).  The layout and nesting used
# here were reconstructed from the statement order and must be checked against
# the original file before this script is run.

import sys
import BeautifulSoup
import urllib2
import re
import time, datetime
import codecs    # used to read/write text file with the correct encoding

# Name printed in the banner at the start of main().
botName="RetraitesBot/0.1"

# Sénat: site root and the ballot page passed to senat_vote().
baseSenatUrl="http://www.senat.fr"
# Alternative ballot page, kept for reference:
# senatUrl="http://www.senat.fr/scrutin-public/2010/scr2010-82.html"
senatUrl="http://www.senat.fr/scrutin-public/2010/scr2010-89.html"

# Assemblée Nationale: site root (prefixed to each deputy's relative link in
# assemblee_identify) and the ballot page passed to assemblee_vote().
baseAssembleeUrl="http://www.assemblee-nationale.fr"
# Alternative ballot page, kept for reference:
#assembleeUrl="http://www.assemblee-nationale.fr/13/scrutins/jo0601.asp"
assembleeUrl="http://www.assemblee-nationale.fr/13/scrutins/jo0646.asp"

deputeFile="depute.html" # load from disk the depute list (do not work from internet, must be fixed)

def leaf(tag):
    """
    Return the first non-empty text found under the tag.

    Each text node returned by tag.findAll(text=True) is stripped; the first
    one that is not empty after stripping is returned.  Returns "" when there
    is none.
    """
    for text in tag.findAll(text=True):
        text=text.strip()
        if text:
            return text
    return ""

def leaf_list(tag):
    """
    Return the list of all non-empty text found under the tag.

    Same scan as leaf(), but every non-empty stripped text node is collected,
    in the order findAll() yields them.
    """
    # NOTE(review): the local name `list` shadows the builtin inside this function.
    list=[]
    for text in tag.findAll(text=True):
        text=text.strip()
        if text:
            list.append(text)
    return list

def senateur_compare(x,y):
    """
    Sort function for senators (cmp-style: returns 1, 0 or -1).

    Orders by the `area` attribute first, then by `name` when the areas are
    equal.  Passed to list.sort() in main().
    """
    if x.area>y.area:
        return 1
    elif x.area==y.area:
        # Same area: fall back to the name.
        if x.name>y.name:
            return 1
        elif x.name==y.name:
            return 0
        else:
            return -1
    else:
        return -1

def depute_compare(x,y):
    """
    Sort function for deputies (cmp-style: returns 1, 0 or -1).

    Orders by the `area` attribute first, then by `name` when the areas are
    equal.  Passed to list.sort() in main().
    """
    # NOTE(review): statement-for-statement identical to senateur_compare.
    if x.area>y.area:
        return 1
    elif x.area==y.area:
        # Same area: fall back to the name.
        if x.name>y.name:
            return 1
        elif x.name==y.name:
            return 0
        else:
            return -1
    else:
        return -1

def noaccent_str(str):
    """
    Remove accents from a string, to make comparisons easier.
    """
    try:
        # Two tables meant to be read in parallel: characters to look for and
        # their plain counterparts.
        # NOTE(review): as written `accent` holds 28 characters and
        # `sans_accent` only 27; presumably a second leading space was
        # collapsed when the file was mangled - confirm against the original.
        accent= u"'-éèêëàùçôöîïâñÉÈÊËÀÚÇÔÖÎÏÂÑ"
        sans_accent= u" eeeeaucooiianEEEEAUCOOIIAN"
        i=0
        # NOTE(review): SOURCE IS DAMAGED FROM HERE.  The loop condition below
        # is not valid Python as written, and the statements that follow use
        # names (n, first, type, list, Depute, url) that are never defined in
        # the visible text of this function.  Presumably everything between a
        # "<" (loop condition) and a later ">" (a length test) was stripped,
        # taking with it the rest of noaccent_str, the Senateur/Depute classes,
        # senat_vote() and the head of assemblee_vote(), all of which main()
        # relies on.  The fragment is kept token-for-token; its indentation is
        # a reviewer guess.  Restore this region from the original file.
        while i0: name=n.strip()
        d=Depute()
        d.first=first
        d.name=name
        d.vote=type
        list.append(d)
        print "\t",d.first,d.name,":",type
        first=""
        break
        except:
            print "error parsing",sys.exc_info()
    except urllib2.HTTPError, e:
        print "*",url
        print "* HTTPError", e.code
    except urllib2.URLError, e:
        # NOTE(review): `self` is not defined in any visible function signature;
        # reaching this handler would raise NameError instead of reporting.
        print "*",self.url
        print "* URLError", e.reason
    except:
        print "*",self.url
        print "error can't load over internet : ",sys.exc_info()
    return list

def assemblee_identify(list,file):
    """
    Complete the deputies of `list` from the deputy directory page stored on
    local disk (fetching it over the internet does not work, must be fixed).

    list -- deputy objects; their `first` and `name` attributes are read, and
            `id`, `group`, `area` and `url` are set on every entry whose
            accent-stripped "first name" matches a row of the page.
    file -- path of the HTML file to read.

    Returns nothing.  Rows of the page that match no entry of `list` are
    reported on stdout on a line starting with "nomatch".
    """
    try:
        f=open(file)
        if f:
            html=f.read()
            f.close()
            try:
                soup=BeautifulSoup.BeautifulSoup(html)
                divs=soup.findAll('div')
                for div in divs:
                    # A div without an id attribute is treated as id "".
                    try:
                        ref=div['id']
                    except:
                        ref=""
                    if ref=="corps_tableau":
                        table=div('table')
                        first=True
                        # Only the first table of the div is scanned.
                        for tr in table[0]('tr'):
                            if first:
                                # The first row is skipped.
                                first=False
                            else:
                                # `index` is the 1-based position of the cell in the row.
                                index=1
                                idUrl=""
                                idFirst=""
                                idName=""
                                idGroup=""
                                idArea=""
                                id=0
                                for td in tr('td'):
                                    if index==1:    # cell 1: link to the deputy's page
                                        links=td('a')
                                        for link in links:
                                            data=link['href']
                                            idUrl=data
                                            # The numeric id is the last path component of
                                            # the link, without its extension.
                                            items=data.split('/')
                                            data=items[-1]
                                            data=data.split(".")[0]
                                            try:
                                                id=int(data)
                                            except:
                                                id=0
                                    # NOTE(review): the character removed by the replace()
                                    # calls below shows up as a plain space in the damaged
                                    # source; it may originally have been another string -
                                    # confirm against the original file.
                                    if index==3:    # cell 3: first name
                                        data=leaf(td)
                                        data=data.replace(" ","")
                                        idFirst=data.strip()
                                    if index==4:    # cell 4: name
                                        data=leaf(td)
                                        data=data.replace(" ","")
                                        idName=data.strip()
                                    if index==5:    # cell 5: group
                                        data=leaf(td)
                                        data=data.replace(" ","")
                                        idGroup=data.strip()
                                    if index==6:    # cell 6: area
                                        data=leaf(td)
                                        data=data.replace(" ","")
                                        idArea=data.strip()
                                    index=index+1
                                match=False
                                if id<>0:
                                    # Names are compared through noaccent_str() on both sides.
                                    idFName=noaccent_str(idFirst+" "+idName)
                                    for d in list:
                                        fullName=noaccent_str(d.first+" "+d.name)
                                        if fullName==idFName:
                                            d.id=id
                                            d.group=idGroup
                                            d.area=idArea
                                            d.url="%s%s" % (baseAssembleeUrl,idUrl)
                                            match=True
                                    # NOTE(review): nesting of this test (inside or after
                                    # the `if id<>0:` block) cannot be recovered from the
                                    # collapsed source - confirm.
                                    if not match:
                                        print "nomatch\t%d\t%s\t%s\t%s\t%s\t%s" % (id,idFirst,idName,idGroup,idArea,idUrl)
            except:
                print "error parsing",sys.exc_info()
    # NOTE(review): these handlers print `url` / `self.url`, but this function
    # only receives `list` and `file` and no module-level `url` is visible;
    # reaching them would raise NameError.  Nothing visible in the try body
    # calls urllib2 either.
    except urllib2.HTTPError, e:
        print "*",url
        print "* HTTPError", e.code
    except urllib2.URLError, e:
        print "*",self.url
        print "* URLError", e.reason
    except:
        print "*",self.url
        print "error can't load over internet : ",sys.exc_info()

def main():
    """
    Run the two ballots: Sénat first, then Assemblée Nationale.

    For each chamber: get the voters, call GetData() on each, sort them by
    area then name, then write one log file (".xls" name) through each voter's
    log() method and one HTML page.  senat_vote(), assemblee_vote() and the
    GetData()/log() methods are not part of the visible text.
    """
    # NOTE(review): the literals passed to f2.write() below are damaged.  Most
    # are empty, several span raw line breaks (not valid inside a "..."
    # literal) and some empty ones are still %-formatted with arguments, which
    # raises TypeError.  Presumably the HTML markup they contained was
    # stripped; restore them from the original file.  They are reproduced here
    # exactly as found.
    print "--------------------------------------------------------------"
    print botName
    print "--------------------------------------------------------------"
    print "Téléchargement <%s>" % senatUrl
    senateurs=senat_vote(senatUrl)
    print
    print len(senateurs),"sénateur(s)."
    print
    for s in senateurs:
        print s.name,s.vote
        s.GetData()
    # cmp-style sort: by area, then by name.
    senateurs.sort(senateur_compare)
    f=codecs.open("Senat.xls","w",encoding="utf-8")
    f2=codecs.open("Retraites-Senat.html","w",encoding="utf-8")
    f2.write(u"")
    f2.write(u"")
    f2.write(u"")
    # NOTE(review): this text says "députés" although this is the Sénat page.
    f2.write(u"Vote des députés sur la réforme des retraites")
    f2.write(u"

Comment ont voter vos Sénateurs sur la Réforme des Retraites ?

")
    f2.write(u"

Scrutin XXX

")
    f2.write(u"

Source : Sénat

" % senatUrl)
    # `area` is the area currently being written; `table` tells whether a
    # previous area section has already been started.
    area=""
    table=False
    for s in senateurs:
        s.log(f)
        if s.area!=area:
            # The senators are sorted by area, so a change of value starts a
            # new section.
            area=s.area
            if table:
                f2.write(u"")
            f2.write(u"

%s

" % s.area)
            f2.write(u"")
            table=True
        # Name cell: formatted with the email when there is one, else with the
        # url, else with the name alone.
        if len(s.email)>0:
            f2.write(u"" % (s.email,s.name))
        elif len(s.url)>0:
            f2.write(u"" % (s.url,s.name))
        else:
            f2.write(u"" % s.name)
        # Vote cell: "POUR" and "CONTRE" get fixed output, any other value is
        # formatted into the output.
        if s.vote=="POUR":
            f2.write(u"")
        elif s.vote=="CONTRE":
            f2.write(u"")
        else:
            f2.write(u"" % s.vote)
        f2.write(u"" % s.group)
    if table:
        f2.write(u"
SénateurVoteGroupe
%s
%s
%sPOURCONTRE%s%s
")
    f2.write(u"")
    f.close()
    f2.close()

    print "--------------------------------------------------------------"
    print "Téléchargement <%s>" % assembleeUrl
    deputes=assemblee_vote(assembleeUrl)
    # Fills id, group, area and url of the deputies from the local directory page.
    assemblee_identify(deputes,deputeFile)
    print
    print len(deputes),"député(s)."
    print
    for d in deputes:
        print d.first,d.name,d.vote
        d.GetData()
    # cmp-style sort: by area, then by name.
    deputes.sort(depute_compare)
    f=codecs.open("Assemblee.xls","w",encoding="utf-8")
    f2=codecs.open("Retraites-Assemblee.html","w",encoding="utf-8")
    f2.write(u"")
    f2.write(u"")
    f2.write(u"")
    f2.write(u"Vote des députés sur la réforme des retraites")
    f2.write(u"

Comment ont voter vos Députés sur la Réforme des Retraites ?

")
    f2.write(u"

Scrutin XXX

")
    f2.write(u"

Source : Assemblée Nationale

" % assembleeUrl)
    # Same section logic as for the Sénat page above.
    area=""
    table=False
    for d in deputes:
        d.log(f)
        if d.area!=area:
            area=d.area
            if table:
                f2.write(u"")
            f2.write(u"

%s

" % d.area)
            f2.write(u"")
            table=True
        # Name cell: formatted with the email when there is one, else with the
        # url, else with first name and name alone.
        if len(d.email)>0:
            f2.write(u"" % (d.email,d.first,d.name))
        elif len(d.url)>0:
            f2.write(u"" % (d.url,d.first,d.name))
        else:
            f2.write(u"" % (d.first,d.name))
        # The vote values tested here are capitalised ("Pour"/"Contre"), unlike
        # the upper-case values tested for the Sénat.
        if d.vote=="Pour":
            f2.write(u"")
        elif d.vote=="Contre":
            f2.write(u"")
        else:
            f2.write(u"" % d.vote)
        f2.write(u"" % d.group)
    if table:
        f2.write(u"
DéputéVoteGroupe
%s %s
%s %s
%s %sPOURCONTRE%s%s
")
    f2.write(u"")
    f.close()
    f2.close()

    print "--------------------------------------------------------------"
    print "OK"

if __name__ == '__main__' :
    main()