Spaces:
Runtime error
Runtime error
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author:quincy qiang
@license: Apache Licence
@file: wiki_process.py
@time: 2023/04/19
@contact: [email protected]
@software: PyCharm
@description: https://blog.csdn.net/weixin_40871455/article/details/88822290
"""
import logging
import sys

from gensim.corpora import WikiCorpus
# Configure the root logger: timestamped records at INFO level and above.
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)

# Extract data from wiki dumps (*articles.xml.bz2) with gensim.
# @2019-3-26
def help():
    """Print the command-line usage message to standard output.

    NOTE: this name shadows the built-in ``help``; it is kept unchanged
    because the script's ``__main__`` section calls it by this name.
    """
    print("Usage: python wikipro.py zhwiki-20190320-pages-articles-multistream.xml.bz2 wiki.zh.txt")
if __name__ == '__main__':
    # Script entry point: convert a Wikipedia dump into a plain-text file,
    # one article per line with its tokens separated by single spaces.
    #
    # argv[1] is the path of the dump to read, argv[2] the text file to write.
    if len(sys.argv) < 3:
        help()
        sys.exit(1)
    logging.info("running %s", ' '.join(sys.argv))
    inp, outp = sys.argv[1:3]

    # Build the corpus before opening the output so that a bad input path
    # fails without truncating an existing output file.
    # NOTE(review): passing an explicit (empty) dictionary presumably skips
    # gensim's own vocabulary-building pass over the dump -- confirm against
    # the WikiCorpus documentation for the installed gensim version.
    wiki = WikiCorpus(inp, dictionary={})

    i = 0  # stays 0 if the corpus yields no articles
    # The context manager closes the file even if iteration raises midway.
    with open(outp, 'w', encoding='utf8') as output:
        for i, text in enumerate(wiki.get_texts(), start=1):
            output.write(" ".join(text) + "\n")
            # Periodic progress report.
            if i % 10000 == 0:
                logging.info("Save %d articles", i)
    logging.info("Finished saved %d articles", i)
# Run from the command line:
# python wikipro.py cache/zh_wikipedia/zhwiki-latest-pages-articles.xml.bz2 wiki.zh.txt