遇到的问题:找到了一篇文献的TeX源文件,想用翻译软件把它翻译成中文以便阅读,但机器翻译会把不该翻译的内容(TeX命令、环境名、公式等)也一并翻译,比如
\begin{document}
给翻译成了
\开始{文档}
因此,尝试使用正则表达式将Tex中没有必要翻译的关键字与公式给替换掉
(1)将关键字替换掉
#将Tex文件的关键字及公式替换掉并保存下来,防止被翻译软件翻译掉
import os
import re
# Patterns whose every match is masked on its own. They are applied in list
# order, so display math is tried before inline math, and commands with a
# braced argument before bare commands.
replaceDict = [
    r'(\$\$[^\$]+\$\$)',  # math expression between a pair of $$
    r'(\$[^\$]+\$)',  # math expression between a pair of $
    r'(\\[a-zA-Z]+{[^{}]+})',  # commands of the form \xxx{xxx}
    r'(\\[a-zA-Z]+)',  # commands of the form \xxx
]

# Delimiter patterns. Matches are consumed two at a time: everything from one
# match up to and including the next match is masked as a single fragment.
expressDict = [
    r'(\\begin{equation}|\\end{equation})',
    r'(\\begin{equation\*}|\\end{equation\*})',
    r'(\\begin{align}|\\end{align})',
    r'(\\begin{align\*}|\\end{align\*})',
    r'(\\begin{table}|\\end{table})',
    r'(\\begin{table\*}|\\end{table\*})',
    r'(\\\[|\\])',  # \[ or \]
    # NOTE(review): this is a character class that matches a literal '|'.
    # If bare '[' / ']' were intended it would have to be r'(\[|])' -- confirm.
    r'([|])',
    r'({|})',  # { or }
]
# Note: without the r prefix a backslash is unescaped once by the Python
# parser and then once more by the regex engine.
# Note: a right square bracket does not need escaping.
# Note (original author): braces need no escaping when their content is not
# numeric, i.e. when they cannot be read as a quantifier.

# Character that brackets the fragment number in a placeholder. It should not
# occur in the source text and should survive the translation software as is.
replaceChar = 'の'
# Separator between the stored fragments inside the temp file.
splitChar = 'ん'
inputFileName = 'main.tex'
outputFileName = 'out.tex'
tempFileName = 'temp.txt'


def _placeholder(index):
    """Return the token that stands in for stored fragment number *index*."""
    return replaceChar + str(index) + replaceChar


def _mask_preamble(text, fragments):
    """Mask everything up to and including the first ``\\begin{document}``.

    The text is returned unchanged when the marker is absent (the original
    script raised IndexError in that case).
    """
    head, marker, body = text.partition(r'\begin{document}')
    if not marker:
        return text
    fragments.append(head + marker)
    return _placeholder(len(fragments) - 1) + body


def _mask_paired(text, pattern, fragments):
    """Mask each ``delimiter ... delimiter`` span found by *pattern*.

    *pattern* must contain exactly one capturing group, so that
    ``pattern.split`` yields text pieces at even indices and the matched
    delimiters at odd indices. A trailing delimiter without a partner is left
    in the text untouched (the original script raised IndexError there).
    """
    parts = pattern.split(text)
    out = [parts[0]]
    pos = 1
    # A while loop with manual stepping is used because one masked span
    # consumes several entries of ``parts`` at once.
    while pos < len(parts):
        if pos + 2 >= len(parts):
            # Unbalanced: keep the lone delimiter and the text after it.
            out.extend(parts[pos:])
            break
        fragment = parts[pos] + parts[pos + 1] + parts[pos + 2]
        print(fragment)
        fragments.append(fragment)
        out.append(_placeholder(len(fragments) - 1))
        out.append(parts[pos + 3])
        pos += 4
    return ''.join(out)


def _mask_single(text, pattern, fragments):
    """Mask every individual match of *pattern* (one capturing group)."""
    parts = pattern.split(text)
    for pos in range(1, len(parts), 2):
        print(parts[pos])
        fragments.append(parts[pos])
        parts[pos] = _placeholder(len(fragments) - 1)
    return ''.join(parts)


def mask_tex(text):
    """Replace TeX constructs in *text* by numbered placeholders.

    Returns ``(masked_text, fragments)``. ``fragments[i]`` is the original
    text behind the placeholder ``replaceChar + str(i) + replaceChar``. A
    fragment may itself contain placeholders of fragments masked earlier.

    Raises:
        ValueError: if *text* already contains ``replaceChar`` or
            ``splitChar``; the placeholders or the temp file could then not
            be decoded unambiguously afterwards.
    """
    for reserved in (replaceChar, splitChar):
        if reserved in text:
            raise ValueError(
                'input already contains the reserved character %r; '
                'choose a different replaceChar/splitChar' % reserved
            )
    fragments = []
    # Mask the preamble first so that it always becomes fragment 0.
    text = _mask_preamble(text, fragments)
    # Then mask delimited math/table environments as whole spans.
    for source in expressDict:
        text = _mask_paired(text, re.compile(source), fragments)
    # Finally mask the remaining stand-alone constructs.
    for source in replaceDict:
        text = _mask_single(text, re.compile(source), fragments)
    return text, fragments


def main():
    """Mask ``inputFileName`` into ``outputFileName`` and ``tempFileName``."""
    with open(inputFileName, 'r', encoding='utf-8') as fin:
        textContent = fin.read()
    maskedContent, replacedContent = mask_tex(textContent)
    # Every fragment is followed by splitChar, so fragment i is the i-th
    # field of the temp file.
    with open(tempFileName, 'w', encoding='utf-8') as ftempout:
        ftempout.write(''.join(item + splitChar for item in replacedContent))
    with open(outputFileName, 'w', encoding='utf-8') as fout:
        fout.write(maskedContent)


if __name__ == '__main__':
    main()
(2)替换回来
#将翻译后的Tex文件中的占位符替换回之前保存下来的关键字及公式
import os
import re
# Character that brackets the fragment number in a placeholder. It should not
# occur in the source text and should survive the translation software as is.
replaceChar = 'の'
# Separator between the stored fragments inside the temp file.
splitChar = 'ん'
inputFileName = 'main.tex'
outputFileName = 'out.tex'
translatedFileName = 'transed.tex'
tempFileName = 'temp.txt'
finalOutputFileName = 'out2.tex'

# One placeholder: replaceChar, a decimal fragment number, replaceChar.
contentPattern = '(' + replaceChar + r'[0-9]+' + replaceChar + ')'
pattern = re.compile(contentPattern)


def restore_tex(text, fragments):
    """Expand every placeholder in *text* back to its stored fragment.

    Fragments may themselves contain placeholders, so the expansion is
    repeated until no well-formed placeholder is left. A stray ``replaceChar``
    that is not part of a well-formed placeholder is left in the text; the
    original loop tested for the bare character and never terminated in that
    case.

    Raises:
        IndexError: if a placeholder refers to a fragment number that is not
            present in *fragments*.
    """
    marker_len = len(replaceChar)
    while pattern.search(text) is not None:
        # One capturing group: text at even indices, placeholders at odd ones.
        parts = pattern.split(text)
        for pos in range(1, len(parts), 2):
            index = int(parts[pos][marker_len:-marker_len])
            if index >= len(fragments):
                raise IndexError(
                    'placeholder %r has no stored fragment' % parts[pos]
                )
            parts[pos] = fragments[index]
        # Join with spaces: without them a keyword could stick to the
        # neighbouring text and break compilation.
        text = ' '.join(parts)
    return text


def main():
    """Restore ``translatedFileName`` into ``finalOutputFileName``."""
    with open(tempFileName, 'r', encoding='utf-8') as ftemp:
        replacedContent = ftemp.read().split(splitChar)
    with open(translatedFileName, 'r', encoding='utf-8') as fin:
        textContent = fin.read()
    with open(finalOutputFileName, 'w', encoding='utf-8') as fout:
        fout.write(restore_tex(textContent, replacedContent))


if __name__ == '__main__':
    main()
标签:replaceNum,replaceChar,pattern,textContent,tex,关键字,re,替换 From: https://www.cnblogs.com/isakovsky/p/17436830.html