这是修改后的用于提取 sla 文件内容的程序,移除了对 Beautiful Soup 的依赖,只需要 Python 3.x 即可执行。sla 中的内容按页保存在 'No-x.txt' 形式的文本文件中,跨越多页的文章保存在以“起始页”命名的文件中。
python2.x 稍作修改,应该也可以执行,未测试.
#coding=utf-8
#-------------------------------------------------------------------------------
# Name: slafile
# Purpose:
#
# Author: dysp...@gmail.com
#
# Created: 07/11/2012
# Copyright: (c) dysp...@gmail.com 2012
# Licence: <your licence>
#-------------------------------------------------------------------------------
import re
# Module-level patterns used to pull text out of the raw .sla markup.
# All of them are raw strings so that regex escapes such as \s are passed to
# the regex engine untouched instead of being parsed (and warned about) as
# string-literal escapes.

# Captures the value of the ANZPAGES attribute on the DOCUMENT tag.
re_PageNum = re.compile(r'<DOCUMENT ANZPAGES="(.+?)"', re.I)
# %-template for one PAGEOBJECT element belonging to a given OwnPage value.
re_PageObjects = r'(<PAGEOBJECT OwnPage="%s".*?</PAGEOBJECT>)'
# Captures each self-closing ITEXT tag in full.
re_Line = re.compile(r'(<ITEXT.+?/>)', re.I)
# Captures the value of a ch="..." attribute.
re_Text = re.compile(r'ch="(.+?)"', re.I)
# Captures the ch value of an ITEXT that is followed (possibly across
# newlines, hence [.\s\S]) by a trail tag whose PARENT is "Article Title".
re_Title = re.compile(
    r'<ITEXT.+?ch="(.+?)"[.\s\S]+?<trail PARENT="Article Title"/>', re.I
)
# Matches a whole ch="..." attribute, name included.
re_CH = re.compile(r'ch=".+?"', re.I)
class SlaFile(object):
    """Regex-based text extractor for a ``.sla`` document.

    The whole file is read into memory as a single string and searched with
    regular expressions; no XML parser is involved. Most methods rely on the
    module-level patterns ``re_PageNum``, ``re_Line``, ``re_Text`` and
    ``re_Title``.
    """

    # Raw document text; replaced per instance by __init__.
    data = ''

    def __init__(self, f):
        """Read the file at path *f* (decoded as UTF-8) into ``self.data``."""
        with open(f, encoding='utf-8') as slaf:
            self.data = slaf.read()

    def getPageNums(self):
        """Return the ANZPAGES value of the DOCUMENT tag.

        Returns:
            The attribute value as a string, or the integer ``-1`` when no
            DOCUMENT tag with an ANZPAGES attribute is found.
        """
        found = re.findall(re_PageNum, self.data)
        if not found:
            return -1
        return found[0]

    def getPageObjects(self, n):
        """Return every PAGEOBJECT element whose OwnPage attribute equals *n*.

        Args:
            n: Page identifier as it appears in the OwnPage attribute.

        Returns:
            A list of the matched elements as strings (empty if none match).
        """
        # re.escape keeps n literal so characters such as '.' cannot act as
        # regex metacharacters; the raw string keeps \s and \S intact for the
        # regex engine. [.\s\S] lets the match run across newlines.
        pattern = re.compile(
            r'(<PAGEOBJECT OwnPage="%s"[.\s\S]+?</PAGEOBJECT>)'
            % re.escape(str(n)),
            re.I,
        )
        return pattern.findall(self.data)

    def getObjectLines(self, po):
        """Return all ITEXT tags found in the page-object string *po*."""
        return re.findall(re_Line, po)

    def getPageLines(self, n):
        """Return the ITEXT tags of every page object on page *n*, in order."""
        plines = []
        for page_object in self.getPageObjects(n):
            plines.extend(self.getObjectLines(page_object))
        return plines

    def getArticTitle(self, ob):
        """Return the article title found in page object *ob*, or ``None``."""
        titles = re.findall(re_Title, ob)
        return titles[0] if titles else None

    def getLineText(self, line):
        """Return the first ``ch`` attribute value in *line*, or ``None``."""
        texts = re.findall(re_Text, line)
        return texts[0] if texts else None

    def getPageContent(self, n):
        """Build the text content of page *n*.

        The result starts with a ``No-<n>`` header line. A page object that
        contains an article title contributes only a ``TITLE:`` line; any
        other page object contributes the concatenated text of its ITEXT
        tags followed by a newline (nothing if it has no text).

        Returns:
            The assembled page text as one string.
        """
        content = ['No-%s\n' % n]
        for ob in self.getPageObjects(n):
            title = self.getArticTitle(ob)
            if title is not None:
                content.append('TITLE:' + title)
                content.append('\n')
                # A title object is not scanned for body text.
                continue
            has_text = False
            for line in self.getObjectLines(ob):
                text = self.getLineText(line)
                if text is not None:
                    content.append(text)
                    has_text = True
            if has_text:
                content.append('\n')
        return ''.join(content)

    def saveArticle(self, n):
        """Write the content of page *n* to ``No-<n>.txt`` in the current directory.

        Args:
            n: Integer page identifier (the file name is built with ``%d``).
        """
        data = self.getPageContent(n)
        filename = 'No-%d.txt' % n
        # The with-block flushes and closes the file on exit.
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(data)
def main():
    """Usage example: dump every page of 'issue65.sla' to its own text file."""
    # Build the extractor; it needs the path of the .sla file.
    document = SlaFile('issue65.sla')
    # Total number of pages reported by the document.
    page_total = int(document.getPageNums())
    page_index = 0
    while page_index < page_total:
        # Save the content of this page into the current directory.
        document.saveArticle(page_index)
        page_index += 1


if __name__ == '__main__':
    main()