下载cnblogs所有博客内容.
写了一个python脚本,简单粗暴
# Download every post of the cnblogs blog "cutepig" into local .htm files.
#
# The script walks the blog's paginated title listing, saves each post it has
# not seen before, and records the newest post URL in cfg.txt so that the next
# run can stop as soon as it reaches that URL again.
#
# The code is written so that it runs on both Python 2 and Python 3.

import os
import sys
import urllib
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen as _urlopen
except ImportError:  # Python 2
    from urllib import urlopen as _urlopen

_PY3 = sys.version_info[0] >= 3


def _open_text(fname, mode):
    """Open *fname* in text mode; on Python 3 force UTF-8 encoding."""
    if _PY3:
        return open(fname, mode, encoding='utf-8')
    return open(fname, mode)


def _for_console(text):
    """Return *text* ready for printing: byte strings are decoded as UTF-8."""
    if isinstance(text, bytes):
        return text.decode('utf-8')
    return text


def getUrlContent(url):
    """Fetch *url* and return the response body as the native ``str`` type.

    On Python 2 this is the raw byte string. On Python 3 the bytes are
    decoded as UTF-8 (undecodable bytes are replaced) so that the rest of the
    script can keep using plain ``str`` operations.
    """
    # closing() guarantees the connection is released even if read() raises;
    # the Python 2 response object is not a context manager by itself.
    with closing(_urlopen(url)) as resp:
        data = resp.read()
    if _PY3:
        data = data.decode('utf-8', 'replace')
    return data


class findBlog:
    """Scan one listing page and yield its posts one ``get()`` call at a time.

    ``cont`` is the page HTML; ``p`` is the offset where the next search
    starts (it advances past each post that ``get()`` returns).
    """

    _POST_MARK = '<div class="post">'
    _TITLE_MARK = '<h2><a id="homepage1_HomePageDays'
    _HREF_MARK = 'href="'
    _HREF_END = '">'
    _LINK_END = '</a>'

    def __init__(self, cont):
        self.p = 0
        self.cont = cont

    def get(self):
        """Return ``[url, title]`` for the next post, or ``None`` if none is left.

        ``str.find`` signals "not found" with -1, and 0 is a legitimate match
        position, so every lookup is compared against -1.
        """
        cont = self.cont

        post = cont.find(self._POST_MARK, self.p)
        if post == -1:
            return None
        heading = cont.find(self._TITLE_MARK, post)
        if heading == -1:
            return None
        href = cont.find(self._HREF_MARK, heading)
        if href == -1:
            return None
        href_end = cont.find(self._HREF_END, href)
        if href_end == -1:
            return None
        link_end = cont.find(self._LINK_END, href_end)
        if link_end == -1:
            return None

        url = cont[href + len(self._HREF_MARK):href_end]
        title = cont[href_end + len(self._HREF_END):link_end]
        self.p = link_end  # resume after this post on the next call
        return [url, title]


def logFile(fname, cont):
    """Write *cont* to *fname*, replacing any existing file.

    An existing file is reported on stdout but still overwritten.
    """
    if os.path.isfile(fname):
        print('%s esist!' % (fname,))
    with _open_text(fname, 'w') as fp:
        fp.write(cont)


def appendFile(fname, cont):
    """Append *cont* to *fname*, creating the file if needed."""
    with _open_text(fname, 'a') as fp:
        fp.write(cont)


def MyCmd(x):
    """Echo the shell command *x* and then run it with ``os.system``."""
    print(x)
    os.system(x)


class blogReader:
    """Download all posts, page by page, until an already-saved post is met.

    Attributes:
        is_latest_written: set to 1 once cfg.txt has been rewritten this run.
        latest_url: first line of cfg.txt from the previous run ('' if absent).
        posts_on_last_page: number of posts found by the latest readPage call.
    """

    def __init__(self):
        self.is_latest_written = 0
        self.latest_url = ''
        self.posts_on_last_page = 0

        # Load the stop marker left behind by the previous run, if any.
        fname = 'cfg.txt'
        if os.path.isfile(fname):
            with _open_text(fname, 'r') as fp:
                self.latest_url = fp.readline().strip()
        print('latest_url %s' % (self.latest_url,))

    def readPage(self, pid):
        """Process listing page number *pid*.

        Each post on the page is logged to log.txt, saved to its own .htm
        file and linked from index2.htm. The first post URL seen in this run
        is stored in cfg.txt as the stop marker for the next run.

        Returns 1 if the post recorded by the previous run was reached
        (nothing older needs downloading), otherwise 0.
        """
        is_latest = 0
        found = 0
        cont = getUrlContent(
            'http://www.cnblogs.com/cutepig/default.html?page=%d&OnlyTitle=1' % pid)
        fb = findBlog(cont)

        with _open_text('log.txt', 'a') as fpLog:
            fpLog.write('--------page %s\n' % (pid,))
            print('--------page %s' % (pid,))
            while 1:
                ret = fb.get()
                if ret is None:
                    break
                found += 1
                url, title = ret
                fpLog.write('%s\n' % (ret,))

                if not self.is_latest_written:
                    logFile('cfg.txt', url)
                    self.is_latest_written = 1

                # Byte-string titles are decoded as UTF-8 before printing.
                print(_for_console(title))

                if url == self.latest_url:
                    is_latest = 1
                    break

                # Build a file name from the URL by dropping ; & ? : and /.
                blogFname = url
                for ch in (';', '&', '?', ':', '/'):
                    blogFname = blogFname.replace(ch, '')
                blogFname += '.htm'

                logFile(blogFname, getUrlContent(url))
                appendFile('index2.htm',
                           '<a href=%s>%s</a><br>\n' % (blogFname, title))

        self.posts_on_last_page = found

        # NOTE(review): `copy` is a Windows shell command and this runs after
        # every page while index2.htm keeps growing -- confirm the resulting
        # index.htm is what is intended.
        MyCmd('copy /y index2.htm+index.htm index.htm')
        return is_latest

    def read_all(self):
        """Read pages 1, 2, 3, ... until there is nothing new to fetch.

        Stops when readPage reports the previously saved post, or when a page
        contains no posts at all (otherwise a run without a usable cfg.txt
        marker would request pages forever).
        """
        pid = 1
        while 1:
            is_latest = self.readPage(pid)
            if is_latest or not self.posts_on_last_page:
                break
            pid += 1


if __name__ == '__main__':
    br = blogReader()
    br.read_all()