#!/usr/bin/python '''rsspubblica.py Screen scraping di repubblica.it, ne ottengo un feed RSS ''' print "Content-type: application/xml\n\n"; import re, urllib uri = "http://www.repubblica.it" doc = urllib.urlopen(uri).read() rt = re.compile('(.*?)', re.S) t = rt.findall(doc) rs = re.compile('(.*?)', re.S) s = rs.findall(doc) rf = re.compile('/sommario 02-->(.*?)(.*?)()', re.S) a = ra.findall(f[0]) ri = re.compile('', re.S) rlink = re.compile('^(.*?)', re.S) def clean(text): text = text.replace(' ','') text = text.replace('\r\n','') text = text.replace('à','a\'') text = text.replace('è','e\'') text = text.replace('é','e\'') text = text.replace('ì','i\'') text = text.replace('ò','o\'') text = text.replace('ù','u\'') r = re.compile('<(.*?)>') text = r.sub(' ', text) return text def cleanDescription(text): text = text.replace('href="/', 'href="' + uri + '/') text = text.replace('<', '<') return text print """ La Repubblica http://cavedoni.com/varie/python/rsspubblica I titoli della home page di repubblica.it it """ for item in range(len(t)): iuri = ri.findall(t[item][1]) zuri = uri + iuri[0] print ' ' for item in range(len(a)): iuri = ri.findall(a[item][0]) zuri = uri + iuri[0] print ' ' print """ """ for item in range(len(s)): iuri = ri.findall(t[item][1]) zuri = uri + iuri[0] print '' print " "+zuri+"" print " " + clean(t[item][1]) + "" print " " + clean(s[item][1]) + "" print "" for item in range(len(a)): iuri = ri.findall(a[item][0]) title = rlink.findall(a[item][0]) zuri = uri + iuri[0] print '' print ' ' + clean(title[0]) + '' print " "+zuri+"" print " " + cleanDescription(a[item][0]) + "" print "" print ""