python 捉虫工具-白红宇

python 捉虫工具

阅读量：6830 次

发布时间：2019-06-26

本文共 3687 字，大约阅读时间需要 12 分钟。

#!/usr/bin/env pythonfrom sys import argvfrom os import makedirs,unlink,sepfrom os.path import dirname,exists,isdir,splitextfrom string import replace,find,lowerfrom htmllib import HTMLParserfrom urllib import urlretrievefrom urlparse import urlparse,urljoinfrom formatter import DumbWriter,AbstractFormatterfrom cStringIO import StringIOclass Retriever(object):#download Web Pages    def __init__(self,url):        self.url=url        self.file=self.filename(url)    def filename(self,url,deffile='index.htm'):        parsedurl=urlparse(url,'http:',0) ##parse path        path=parsedurl[1]+parsedurl[2]        ext=splitext(path)        if ext[1]=='':            if path[-1]=='/':                path+=deffile            else:                path+='/'+deffile        ldir=dirname(path)   #local directory        if sep!='/':   # os-indep.path separator            ldir=replace(ldir,'/',sep)        if not isdir(ldir):  #create archive dir if nec            if exists(ldir):                unlink(ldir)            makedirs(ldir)        return path    def download(self):  #download Web page        try:            retval=urlretrieve(self.url,self.file)        except IOError:            retval=('***ERROR:invalid URL "%s"' % self.url,)        return retval    def parseAndGetLinks(self): #parse HTML,save links        self.parser=HTMLParser(AbstractFormatter(DumbWriter(StringIO())))        self.parser.feed(open(self.file).read())        self.parser.close()        return self.parser.anchorlistclass Crawler(object):#manage entire crawling process    count=0 #static downloaded page counter    def __init__(self,url):        self.q=[url]        self.seen=[]        self.dom=urlparse(url)[1]    def getPage(self,url):        r=Retriever(url)        retval=r.download()        if retval[0]=='*': #error situation,do not parse            print retval,'...skipping parse'            return        Crawler.count+=1        print '\n(',Crawler.count,')'        print 'URL:',url        print 
'FILE:',retval[0]        self.seen.append(url)        links=r.parseAndGetLinks() #get and process links        for eachLink in links:            if eachLink[:4]!='http' and find(eachLink,'://')==-1:                eachLink=urljoin(url,eachLink)            print '* ',eachLink,            if find(lower(eachLink),'mailto:') !=-1:                print '... discarded,mailto link'                continue            if eachLink not in self.seen:                if find(eachLink,self.dom)==-1:                    print '...discarded,not in domain'                else:                    if eachLink not in self.q:                        self.q.append(eachLink)                        print '...new,added to Q'                    else:                        print '...discarded,already processed'            else:                print '...discarded,already processed'    def go(self):#process links in queue        while self.q:            url=self.q.pop()            self.getPage(url)def main():    if len(argv)>1:        url=argv[1]    else:        try:            url=raw_input('Enter starting URL:')        except (KeyboardInterrupt,EOFError):            url=''        if not url:return        robot=Crawler(url)        robot.go()if __name__=='__main__':    main()        另外说明：  unlink(const char *pathname)此函数删除目录项，并有pathname所引用的链接计数减1 硬连接  软链接文件 ln [-s] source_path target_path

　　硬链接用 ln existing-file new-link 来创建。当同一文件有多个名称时，每个名称都被称为该文件的硬链接，即同一个 i 节点号对应多个文件名。用 rm 删除其中某一个文件名，其余的名称和文件内容不会受到影响。

　　软链接是一个小的指针文件，用 ln -s real-file ptr-file 来创建。

　　它可以对目录和其他文件系统进行操作，而硬链接就不行。一般指令会对指针指向的文件进行操作，而 rm、mv 指令只对软链接本身操作。用 ls -l 只能看到软链接。

　　符号链接：软链接，指向文件所在目录位置的指针，删除对原始文件无影响。

　　硬链接：指向文件的 inode 节点，增加原始文件的引用计数，引用计数减为零时才删除原始文件。

转载地址：http://oztkl.baihongyu.com/

你可能感兴趣的文章