import glob
import os
import fnmatch
import shutil
import sys


def iterfindfiles(path, fnexp):
    """Yield the path of every file below *path* whose name matches *fnexp*.

    Args:
        path: Directory to walk recursively.
        fnexp: Shell-style wildcard pattern (as understood by ``fnmatch``),
            matched against the bare file name, not the full path.

    Yields:
        ``os.path.join(root, filename)`` for each matching file.
    """
    for root, _dirs, files in os.walk(path):
        for filename in fnmatch.filter(files, fnexp):
            yield os.path.join(root, filename)


def main():
    """Move every ``*.zip`` found under ./input/ into ./zip/.

    Each file is renamed to ``<counter>_<original name>`` so archives that
    share a base name in different source directories do not overwrite
    each other.
    """
    # shutil.move does not create the destination directory; without this
    # the first move fails when ./zip/ is missing.
    if not os.path.isdir("zip"):
        os.makedirs("zip")
    for i, filename in enumerate(iterfindfiles(r"./input/", "*.zip"), 1):
        newfilename = "zip/" + str(i) + "_" + os.path.basename(filename)
        # Parenthesised single-argument form prints identically on
        # Python 2 and Python 3.
        print(filename + " <===> " + newfilename)
        shutil.move(filename, newfilename)


if __name__ == '__main__':
    main()
```bash
i=0; for file in `ls`; do mkdir output/${i}; echo "unzip $file -d output/${i}";unzip -P abc $file -d output/${i} > /dev/null; ((i++)); done
i=0; for file in `ls`; do mkdir output/${i}; echo "${i} unrar x $file output/${i}";unrar x $file output/${i} > /dev/null; ((i++)); done
```

### 第四步:srt、ass、ssa字幕文件分类整理

当你下载大量字幕并解压后你会发现字幕文件类型有很多种,包括srt、lrc、ass、ssa、sup、idx、str、vtt,但是整体量级上来看srt、ass、ssa占绝对优势,因此简单起见,我们抛弃掉其他格式,只保留这三种,具体分类整理的脚本可以参考第二步压缩格式分类的方法按扩展名整理

### 第五步:清理目录

在我边整理边分析的过程中发现,我为了避免重名把文件放到不同目录里后,如果再经过一步文件类型整理,会产生非常多的空目录,每次ls都要拉好几屏,所以写了一个自动清理空目录的脚本clear_empty_dir.py,如下:

```python
import glob
import os
import fnmatch
import shutil
import sys

def iterfindfiles(path, fnexp):
    for root, dirs, files in os.walk(path):
        if 0 == len(files) and len(dirs) == 0:
            print root
            os.rmdir(root)

iterfindfiles(r"./input/", "")
```
import glob
import os
import fnmatch
import shutil
import sys

# File-name patterns of everything that is not a subtitle and gets deleted.
JUNK_PATTERNS = ("*.mp4", "*.txt", "*.JPG", "*.htm", "*.doc", "*.docx",
                 "*.nfo", "*.sub", "*.idx")


def iterfindfiles(path, fnexp):
    """Yield the path of every file below *path* whose name matches *fnexp*.

    Args:
        path: Directory to walk recursively.
        fnexp: Either one shell-style wildcard pattern or a list/tuple of
            patterns; a file is yielded once if it matches any of them.
            Patterns are matched against the bare file name.

    Yields:
        ``os.path.join(root, filename)`` for each matching file.
    """
    if isinstance(fnexp, (list, tuple)):
        patterns = tuple(fnexp)
    else:
        patterns = (fnexp,)
    for root, _dirs, files in os.walk(path):
        for filename in files:
            if any(fnmatch.fnmatch(filename, pat) for pat in patterns):
                yield os.path.join(root, filename)


def main():
    """Delete every file under ./input/ whose name matches JUNK_PATTERNS."""
    # One traversal with all patterns instead of one full os.walk per
    # pattern; the set of removed files is the same.
    for filename in iterfindfiles(r"./input/", JUNK_PATTERNS):
        print(filename)
        os.remove(filename)


if __name__ == '__main__':
    main()
import chardet
import sys
import os

# Detected encodings that are left untouched. Compared lower-cased so the
# check does not depend on how the detector capitalises the name.
KEEP_ENCODINGS = ("utf-8-sig", "utf-16le", "utf-8", "ascii")


def convert_to_utf8(file_path):
    """Rewrite *file_path* as UTF-8 if it is not already in a kept encoding.

    The content is assumed to be GB18030 whenever the detected encoding is
    not one of KEEP_ENCODINGS.

    Args:
        file_path: Path of the file to convert in place.

    Returns:
        True if the file was rewritten, False if it was left as is.

    Raises:
        UnicodeDecodeError: The content is not valid GB18030. Nothing has
            been written in that case.
        IOError / OSError: The file could not be read or written.
    """
    # Binary mode: the detector and ``decode`` both work on raw bytes, and
    # no newline translation may touch the data.
    with open(file_path, 'rb') as f:
        data = f.read()
    encoding = chardet.detect(data)["encoding"]
    if (encoding or "").lower() in KEEP_ENCODINGS:
        return False
    # Transcode fully in memory before reopening the file for writing, so a
    # decode failure can never leave a truncated file behind.
    utf8_bytes = data.decode("gb18030").encode('utf-8')
    with open(file_path, 'wb') as f:
        f.write(utf8_bytes)
    return True


if __name__ == '__main__':
    if len(sys.argv) == 2:
        for root, dirs, files in os.walk(sys.argv[1]):
            for name in files:
                file_path = os.path.join(root, name)
                try:
                    convert_to_utf8(file_path)
                except (UnicodeError, IOError, OSError):
                    # Best effort: report the file and keep going.
                    print("except: " + file_path)
    else:
        sys.stderr.write("usage: %s <directory>\n" % sys.argv[0])
# coding:utf-8
import chardet
import os
import re
import sys

# Plain ``u""`` literals are used instead of ``ur""``: they compile to the
# same patterns on Python 2 and are also valid syntax on Python 3.
cn = u"([\u4e00-\u9fa5]+)"    # CJK ideographs
pattern_cn = re.compile(cn)
jp1 = u"([\u3040-\u309F]+)"   # Hiragana
pattern_jp1 = re.compile(jp1)
jp2 = u"([\u30A0-\u30FF]+)"   # Katakana
pattern_jp2 = re.compile(jp2)


def is_chinese_sentence(sentence):
    """Return True if *sentence* should be kept for the corpus.

    A sentence is kept when it contains at least one CJK ideograph, no
    Hiragana or Katakana, is longer than one character after stripping,
    and splits into fewer than ten space-separated pieces.
    """
    stripped = sentence.strip()
    return (pattern_cn.search(sentence) is not None
            and pattern_jp1.search(sentence) is None
            and pattern_jp2.search(sentence) is None
            and len(stripped) > 1
            and len(stripped.split(' ')) < 10)


def read_sentences(path):
    """Return the lines of *path* decoded with its detected encoding.

    Returns an empty list when no encoding could be detected.

    Raises:
        UnicodeError: The content does not decode with the detected encoding.
        LookupError: Python has no codec for the detected encoding name.
    """
    with open(path, "rb") as f:
        content = f.read()
    encoding = chardet.detect(content)["encoding"]
    if not encoding:
        return []
    return content.decode(encoding).split('\n')


def main():
    """Print every kept sentence found in the files under ./srt as UTF-8."""
    # Write bytes so the output is UTF-8 on Python 2 and Python 3 alike.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    for root, _dirs, files in os.walk("./srt"):
        for name in files:
            try:
                sentences = read_sentences(os.path.join(root, name))
            except (UnicodeError, LookupError):
                # Undecodable file: skip it, as the script always did.
                continue
            for sentence in sentences:
                if is_chinese_sentence(sentence):
                    out.write(sentence.strip().encode('utf-8') + b"\n")


if __name__ == '__main__':
    main()
# Only short "Dialogue" event lines of an ASS/SSA script are processed.
if line.startswith('Dialogue') and len(line) < 500:
    # NOTE(review): assumes the standard ten-field event layout, in which
    # the subtitle text is the last field and is the only one that may
    # contain commas. Splitting at most nine times keeps commas inside the
    # text instead of cutting the sentence at its last comma.
    fields = line.split(',', 9)
    sentence = fields[-1]
    # Keep only what follows the last "}" so leading override blocks such
    # as {\an8} are dropped.
    tag_fields = sentence.split('}')
    if len(tag_fields) > 1:
        sentence = tag_fields[-1]
# coding:utf-8
import sys
import re
import chardet

# A line containing any character from these ranges is dropped.
pattern_illegals = [re.compile(u"([\u2000-\u2010]+)"),
                    re.compile(u"([\u0090-\u0099]+)")]

# A line containing any of these keywords is dropped.
# NOTE(review): the original list repeated the five "xxx:" entries; the
# second copy may have been meant to use the full-width colon - confirm.
filters = [u"字幕", u"时间轴:", u"校对:", u"翻译:", u"后期:", u"监制:",
           u"禁止用作任何商业盈利行为", u"http"]

# "第...季", "第...集" or "第...帧" anywhere in the line.
episode_regex = re.compile(u"第.*[季集帧]")
htmltagregex = re.compile(r'<[^>]+>')
# Non-greedy per-block match: a greedy "{.*}" would also delete any
# dialogue that sits between two separate {...} blocks.
brace_regex = re.compile(r'\{[^}]*\}')
# Explicit ASCII class so the match is the same for byte and text
# patterns on every Python version.
slash_regex = re.compile(r'\\[A-Za-z0-9_]')
repeat_regex = re.compile(r'[-=]{10}')

_ASCII_WHITESPACE = u" \t\n\r\x0b\x0c"


def _log(message):
    """Write the text *message* to stderr as UTF-8 bytes."""
    stream = getattr(sys.stderr, "buffer", sys.stderr)
    stream.write(message.encode("utf-8"))


def clean_line(text):
    """Filter and clean one decoded corpus line.

    Args:
        text: The line as text, without surrounding whitespace.

    Returns:
        The cleaned sentence (possibly empty), or None when the line is
        rejected. Rejections other than the separator check are logged to
        stderr.
    """
    # Reject lines containing characters from the illegal ranges.
    if any(pattern.search(text) for pattern in pattern_illegals):
        _log(u"match_illegal error: %s\n" % text)
        return None

    # Reject credit / advertisement lines.
    for keyword in filters:
        if keyword in text:
            _log(u"filter keyword of %s %s\n" % (keyword, text))
            return None

    # Reject season / episode / frame information.
    if episode_regex.search(text):
        _log(u"filter copora %s\n" % text)
        return None

    # Strip HTML tags, brace-enclosed styling and backslash escapes.
    text = htmltagregex.sub(u'', text)
    text = brace_regex.sub(u'', text)
    text = slash_regex.sub(u'', text)

    # Reject separator lines (ten or more '-' / '=' in a row).
    if repeat_regex.search(text):
        return None

    # Remove remaining dashes.
    return text.replace(u'-', u'').strip(_ASCII_WHITESPACE)


def main():
    """Filter ./corpus/all.out line by line and print the kept sentences."""
    out = getattr(sys.stdout, "buffer", sys.stdout)
    err = getattr(sys.stderr, "buffer", sys.stderr)
    with open("./corpus/all.out", "rb") as f:
        for raw in f:
            raw = raw.strip()
            # Lines that are not valid UTF-8 are dropped.
            try:
                text = raw.decode("utf-8")
            except UnicodeDecodeError:
                # write() takes exactly one argument; build the whole
                # message first instead of passing the line separately.
                err.write(b"decode error: " + raw + b"\n")
                continue
            cleaned = clean_line(text)
            if cleaned:
                out.write(cleaned.encode("utf-8") + b"\n")


if __name__ == '__main__':
    main()
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | 这是什么 是寄给医院的 井崎…为什么? 是为了小雪的事情 怎么回事? 您不记得了吗 在她说小雪… 就是在这种非常时期和我们一起舍弃休息时间来工作的护士失踪时… 医生 小雪她失踪了 你不是回了一句「是吗」吗 是吗… 不 对不起 跟我道歉也没用啊 而且我们都知道您是因为夫人的事情而操劳 但是 我想小聪是受不了医生一副漠不关心的样子 事到如今再责备医生也没有用了 是我的错吗… 我就是这个意思 您听不出来吗 我也难以接受 …… |