return ranges
def GetHttpFileSize(url):
    """
    Return the size in bytes announced for url by its Content-Length header.

    arguments:
        url, the address to open with urllib.urlopen

    returns:
        the length as an integer, or 0 when the URL cannot be opened or the
        header is missing or malformed
    """
    length = 0
    conn = None
    try:
        conn = urllib.urlopen(url)
        # getheader() matches the header name exactly and case-insensitively,
        # unlike a substring search for 'Length' over the raw header lines.
        value = conn.info().getheader('Content-Length')
        if value is not None:
            # Assign only after a successful parse so that a malformed header
            # can never leak a non-integer string to the caller.
            length = int(value.strip())
    except Exception:
        # Deliberate best effort: every failure is reported as size 0.
        length = 0
    finally:
        if conn is not None:
            conn.close()
    return length
def hasLive(ts):
    """Return True if at least one thread in ts reports isAlive(), else False."""
    return any(worker.isAlive() for worker in ts)
def _remove_quietly(path):
    """Delete path, ignoring the error if it is missing or cannot be removed."""
    try:
        os.remove(path)
    except OSError:
        pass


def _merge_part_files(filename, names):
    """Concatenate the part files, in order, into filename and delete each one."""
    import shutil

    target = open(filename, 'wb')
    try:
        for name in names:
            part = open(name, 'rb')
            try:
                # Stream the part instead of loading it fully into memory.
                shutil.copyfileobj(part, target)
            finally:
                part.close()
            _remove_quietly(name)
    finally:
        target.close()


def MyHttpGet(url, output=None, connections=4):
    """
    Download url in several parts and merge them into one file.

    One HttpGetThread is started for each range returned by
    Split(length, connections); each thread writes to "<filename>_<index>".
    Progress is written to stdout while any thread is alive, then the parts
    are concatenated into the output file and removed.

    arguments:
        url, in GBK encoding
        output, target file name in the default encoding (no conversion is
            done); when omitted, the last '/'-separated segment of url is used
        connections, integer, number of parts / threads

    raises:
        URLUnreachable when GetHttpFileSize reports a size of 0

    On KeyboardInterrupt the part files are removed and the process exits
    with status 1.
    """
    length = GetHttpFileSize(url)
    if length == 0:
        raise URLUnreachable
    mb = length // 1024 / 1024.0
    blocks = connections

    # Fall back to the last URL segment when no output name was given.
    if output:
        filename = output
    else:
        filename = url.split('/')[-1]

    ranges = Split(length, blocks)
    names = ["%s_%d" % (filename, i) for i in range(blocks)]

    ts = []
    for i in range(blocks):
        t = HttpGetThread(i, url, names[i], ranges[i])
        t.setDaemon(True)
        t.start()
        ts.append(t)

    startSize = sum([t.downloaded for t in ts])
    startTime = time.time()
    etime = 0
    # Pre-set so the summary line works even if every thread has already
    # finished before the first progress update.
    rate = 0.0
    live = hasLive(ts)
    while live:
        try:
            etime = time.time() - startTime
            downloaded = sum([t.downloaded for t in ts])
            percent = downloaded / float(length) * 100
            try:
                rate = (downloaded - startSize) / float(etime) / 1024
            except ZeroDivisionError:
                # No measurable time has elapsed yet; keep the placeholder
                # value the original code used.
                rate = 100.0
            progressStr = u'\rFilesize: %d(%.2fM) Downloaded: %.2f%% Avg rate: %.1fKB/s' % (length, mb, percent, rate)
            sys.stdout.write(progressStr)
            sys.stdout.flush()
            live = hasLive(ts)
            time.sleep(0.2)
        except KeyboardInterrupt:
            print('')
            print("Exit...")
            for n in names:
                _remove_quietly(n)
            sys.exit(1)

    print('')
    minutes, seconds = divmod(int(etime), 60)
    print(u'耗时: %d:%d, 平均速度:%.2fKB/s' % (minutes, seconds, rate))
    _merge_part_files(filename, names)
#############################################################################
#
# get artist-title pairs from baidu top songs list
#
#############################################################################
class SongParser(SGMLParser):
def reset(self):
SGMLParser.reset(self)
self.songs = {}
self.cursong = ''
self.insong = False
self.newsong = False
self.name = ''
def handle_data(self, text):
txt = text.strip()
if txt == '':
return
res = re.search('^(\d{1,3})\.$', txt)
if res:
rank = int(res.groups()[0])
self.cursong = rank
self.songs[rank] = ''
self.insong = True
self.name = 'artist'
else:
if self.insong:
self.songs[self.cursong] = self.songs[self.cursong] + txt
if txt == ')':
self.insong = False
def GetArtistAndTitle(url):
html = urllib.urlopen(url).read()
html = html.decode('gbk', 'ignore').encode('utf8')
parser = SongParser()
parser.feed(html)
songs = parser.songs
for k, v in songs.items():
pos = v.find('(')
if pos != -1:
title = v[:pos]