#!/usr/bin/python
'''rsspubblica.py
Screen-scrapes repubblica.it and turns the result into an RSS feed.
'''
# Write the content-type line first (presumably this script runs as a CGI
# program -- TODO confirm). Together with print's own newline this emits
# three line breaks after the header text.
print "Content-type: application/xml\n\n";
import re, urllib
# Address of the page to scrape; it is also prepended to the links that are
# extracted further down.
uri = "http://www.repubblica.it"
# Download the whole page into a single string.
doc = urllib.urlopen(uri).read()
# NOTE(review): as written, this pattern is just one lazy group, so findall()
# returns a list of plain strings. The code below indexes t[item][1], which
# suggests the original pattern had at least two groups surrounded by markup
# that is no longer present -- confirm against the original file.
rt = re.compile('(.*?)', re.S)
t = rt.findall(doc)
# NOTE(review): same concern as for `rt`; s[item][1] is read later on.
rs = re.compile('(.*?)', re.S)
s = rs.findall(doc)
# NOTE(review): the next statement is a single-quoted string that continues
# onto a second line, which Python rejects. The names `f` and `ra` used right
# after it are never assigned in the visible code, so at least two statements
# (presumably `f = rf.findall(doc)` and the definition of `ra`) appear to be
# missing here -- restore from the original file.
rf = re.compile('/sommario 02-->(.*?)
(.*?)()', re.S)
a = ra.findall(f[0])
# NOTE(review): `ri` is compiled from an empty pattern and `rlink` from a
# bare anchored lazy group; the loops below use ri.findall(...)[0] as a link
# path and rlink.findall(...)[0] as a title, so the surrounding markup of
# both patterns looks lost as well -- confirm.
ri = re.compile('', re.S)
rlink = re.compile('^(.*?)', re.S)
# Entity spellings are matched instead of the decoded characters: replacing a
# literal space would delete every word separator, and non-ASCII literals are
# rejected by Python 2 unless the file declares a source encoding.
_CLEAN_REPLACEMENTS = (
    ('&nbsp;', ''),
    ('\r\n', ''),
    ('&agrave;', "a'"),
    ('&egrave;', "e'"),
    ('&eacute;', "e'"),
    ('&igrave;', "i'"),
    ('&ograve;', "o'"),
    ('&ugrave;', "u'"),
)

# Compiled once at import time rather than on every call.
_TAG_RE = re.compile('<(.*?)>')


def clean(text):
    """Reduce an HTML fragment to plain ASCII-friendly text.

    Non-breaking-space entities and CRLF line breaks are removed, the
    Italian accented-vowel entities are rewritten as the bare vowel followed
    by an apostrophe (``&agrave;`` becomes ``a'``), and every tag is
    replaced by a single space.

    text -- the HTML fragment to clean.

    Returns the cleaned string. Ordinary spaces are left untouched, and so
    is any entity that is not in the replacement table.
    """
    for needle, replacement in _CLEAN_REPLACEMENTS:
        text = text.replace(needle, replacement)
    # Tags are stripped last so the replacements above cannot be affected by
    # the spaces inserted here. The pattern does not span line breaks.
    return _TAG_RE.sub(' ', text)
def cleanDescription(text, base_uri=None):
    """Prepare an HTML fragment for embedding as text inside the XML feed.

    Site-relative links (``href="/..."``) are made absolute, then the
    characters that would otherwise be parsed as XML markup are escaped.

    text     -- the HTML fragment to embed.
    base_uri -- prefix for site-relative links; when omitted, the
                module-level ``uri`` is used.

    Returns the escaped string.
    """
    if base_uri is None:
        base_uri = uri
    text = text.replace('href="/', 'href="' + base_uri + '/')
    # '&' must be escaped before '<', otherwise the ampersand of the freshly
    # written '&lt;' would be escaped a second time.
    text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;')
    return text
# Feed preamble: feed name, feed address, an Italian description and the
# language code.
# NOTE(review): the lines carry no XML element tags as they stand; given the
# application/xml content type printed earlier, the tags were presumably
# lost -- restore from the original file.
print """
La Repubblica
http://cavedoni.com/varie/python/rsspubblica
I titoli della home page di repubblica.it
it
"""
# First pass over `t`: build an absolute link for each entry.
# NOTE(review): the loop bodies in this section are not indented, and `zuri`
# is computed but not part of the string printed here -- the printed markup
# that referenced it appears to be missing. iuri[0] raises IndexError when
# `ri` finds no match.
for item in range(len(t)):
iuri = ri.findall(t[item][1])
zuri = uri + iuri[0]
print ' '
# Same pass over `a`, reading group 0 of each match instead of group 1.
for item in range(len(a)):
iuri = ri.findall(a[item][0])
zuri = uri + iuri[0]
print ' '
# NOTE(review): prints only blank lines as written; presumably this closed
# the section opened by the preamble -- confirm.
print """
"""
# One entry per element of `s`: link, cleaned title from `t`, cleaned text
# from `s`.
# NOTE(review): the loop is bounded by len(s) but indexes `t` with the same
# counter, so it raises IndexError whenever `s` is longer than `t` and
# silently drops trailing `t` entries when it is shorter -- confirm that
# the two lists are meant to be parallel.
for item in range(len(s)):
iuri = ri.findall(t[item][1])
zuri = uri + iuri[0]
print '- '
print " "+zuri+""
print " " + clean(t[item][1]) + ""
print " " + clean(s[item][1]) + ""
# NOTE(review): the following print is a double-quoted string split across
# two lines, which is not valid Python; its original content is unknown.
print "
"
# One entry per element of `a`: title taken from `rlink`, link from `ri`,
# and the whole matched fragment passed through cleanDescription().
for item in range(len(a)):
iuri = ri.findall(a[item][0])
title = rlink.findall(a[item][0])
zuri = uri + iuri[0]
print '- '
print ' ' + clean(title[0]) + ''
print " "+zuri+""
print " " + cleanDescription(a[item][0]) + ""
# NOTE(review): same broken two-line string literal as in the loop above.
print "
"
# NOTE(review): prints an empty line as written; presumably the closing tag
# of the feed stood here -- confirm.
print ""
|