hi 我写了一个可以从sla中提取文本的小程序

1 view
Skip to first unread message

Li.Ci

unread,
Mar 4, 2011, 9:17:59 AM3/4/11
to fcctt, fullcircle_clt
目前只来得及完成文本提取部分的代码,还没有界面,用

python test2.py  [filename]

的形式即可

test2.py



#coding:utf-8

import HTMLParser
import sys

class SLAParser(HTMLParser.HTMLParser):
    """Collect the longer text runs found in an ``.sla`` file.

    The file is scanned with the tolerant HTML parser. For every ``itext``
    tag, the value of its ``ch`` attribute is appended to ``self.data`` when
    it is longer than ``MIN_TEXT_LENGTH`` characters. HTMLParser lower-cases
    tag and attribute names, so the comparisons below use lower case.
    """

    # Attribute values of this length or shorter are skipped.
    MIN_TEXT_LENGTH = 8

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Must live inside __init__: at class-body level ``self`` does not
        # exist and the class definition itself would raise NameError.
        self.data = []

    def handle_starttag(self, tag, attrs):
        """Record the ``ch`` attribute of an ``itext`` tag if it is long enough.

        tag   -- lower-cased tag name supplied by HTMLParser
        attrs -- list of (name, value) pairs; value is None for attributes
                 written without a value
        """
        if tag != 'itext':
            return

        for key, value in attrs:
            if key != 'ch' or value is None:
                continue
            if len(value) > self.MIN_TEXT_LENGTH:
                self.data.append(value)
                   
# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module has to be reloaded before the default codec can be switched to UTF-8.
# Presumably this is here so non-ASCII text from the file can be printed
# without an implicit ASCII conversion error.
reload(sys)
sys.setdefaultencoding('utf-8')

slap = SLAParser()

# The context manager closes the file even if reading or parsing fails.
with open(sys.argv[1]) as fd:
    print(sys.argv[1])
    slap.feed(fd.read())

# One extracted text run per line.
for text in slap.data:
    print(text)

Michael Kang

unread,
Mar 4, 2011, 9:26:43 AM3/4/11
to fc...@googlegroups.com, Li.Ci, fullcircle_clt
If you can read data from an SLA file, is it also possible to insert text into one?

2011/3/4 Li.Ci <dysp...@gmail.com>



--
Michael Kang(康上明学)
There is a giant asleep within every man. When the giant awakens,miracles happen.

Personal Blog: http://ksmx.me

Li.Ci

unread,
Mar 4, 2011, 10:42:56 PM3/4/11
to fcctt, fullcircle_clt
我修改了程序,现在可以自动分析有多少页,并把sla中的文本内容按页分割开

我测试第46期,还可以,希望有更多测试

python 2.x 系列 无需第三方库



#coding:utf-8

import HTMLParser
import sys

class SLAParser(HTMLParser.HTMLParser):
    """Extract the text of an ``.sla`` file, grouped by page.

    After ``feed()``, ``self.data`` is a list with one string per page
    containing the concatenated ``ch`` attributes of every ``itext`` tag
    found inside the ``pageobject`` tags that belong to that page.

    Tags are matched in lower case because HTMLParser lower-cases tag and
    attribute names before calling the handlers.

    Instance attributes:
      data      -- per-page text; sized from the ``anzpages`` attribute of
                   the ``document`` tag and grown on demand if a page object
                   refers to a page beyond that size
      taglevels -- stack of currently open tag names
      atext     -- text collected for the page object being parsed
      position  -- page index (``ownpage``) of the current page object;
                   -1 means "not on any page"
    """

    # Kept for backward compatibility; neither attribute is read by this class.
    flag = False
    handledtags = ['pageobject', 'itext']

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Per-instance state. As class attributes the lists would be shared
        # (and mutated) by every parser instance.
        self.data = []
        self.taglevels = []
        self.atext = ''
        self.position = -1

    def handle_starttag(self, tag, attrs):
        """Track the open tag and pick up page count, page index and text."""
        # A tag opening directly inside an unclosed tag of the same name is
        # treated as implicitly closing the previous one.
        if self.taglevels and self.taglevels[-1] == tag:
            self.handle_endtag(tag)

        self.taglevels.append(tag)

        if tag == 'document':
            for key, value in attrs:
                if key == 'anzpages':
                    self.data = [''] * int(value)
        elif tag == 'pageobject':
            for key, value in attrs:
                if key == 'ownpage':
                    self.position = int(value)
        elif tag == 'itext':
            for key, value in attrs:
                # value is None for an attribute written without a value.
                if key == 'ch' and value:
                    self.atext += value

    def handle_endtag(self, tag):
        """Close ``tag`` and any tags left open inside it.

        End tags that were never opened are ignored. Closing a
        ``pageobject`` moves the collected text to its page.
        """
        if tag not in self.taglevels:
            return

        if tag == 'pageobject':
            self._flush_page_text()

        # Unwind the stack up to and including the matching start tag.
        while self.taglevels:
            if self.taglevels.pop() == tag:
                break

    def _flush_page_text(self):
        """Append the pending text to its page and reset the buffer."""
        if self.atext and self.position > -1:
            # Grow the page list instead of raising IndexError when the
            # page count was missing or smaller than this page index.
            missing = self.position + 1 - len(self.data)
            if missing > 0:
                self.data.extend([''] * missing)
            self.data[self.position] += self.atext
        # Always reset, so text of an object that is on no page (-1) does
        # not leak into the next page object.
        self.atext = ''

           
# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module has to be reloaded before the default codec can be switched to UTF-8.
# Presumably this is here so non-ASCII text from the file can be printed
# without an implicit ASCII conversion error.
reload(sys)
sys.setdefaultencoding('utf-8')

slap = SLAParser()

# The context manager closes the file even if reading or parsing fails.
with open(sys.argv[1]) as fd:
    slap.feed(fd.read())

# Print the page count, then show one page at a time and wait for Enter
# after each page.
print('共%s页' % len(slap.data))
for pageno, text in enumerate(slap.data, 1):
    print('第%s页' % pageno)
    print(text)
    raw_input('敲回车键继续……')
 
Reply all
Reply to author
Forward
0 new messages