文本数据预处理(一)

发布时间: 2023-09-20 20:18:23 作者: 热爱工作的宁致桑
# 将所有txt文件拷贝至alltxt这个文件夹
import os
import shutil

# Gather every .txt file found under the current directory into ./alltxt.
# exist_ok=True creates the folder only when it is missing, without a
# separate check-then-create step.
os.makedirs("alltxt", exist_ok=True)

# Walk the whole tree rooted at the current directory (top-down).
for root, dirs, files in os.walk("."):
    # Do not descend into the destination folder itself: its files are
    # already in place, so rescanning them is wasted work. Only the
    # top-level "alltxt" is pruned; same-named folders deeper in the tree
    # are still visited.
    if root == ".":
        dirs[:] = [d for d in dirs if d != "alltxt"]
    for file in files:
        # Only .txt files are collected.
        if not file.endswith(".txt"):
            continue
        src_path = os.path.join(root, file)
        dst_path = os.path.join("alltxt", file)
        # The first file seen under a given name wins; later files with the
        # same name are skipped instead of overwriting it.
        if not os.path.exists(dst_path):
            shutil.copy(src_path, dst_path)


# Delete every .txt file in alltxt that is smaller than 1 kB.
for file in os.listdir("alltxt"):
    # Entries that are not .txt files are left untouched.
    if not file.endswith(".txt"):
        continue
    # getsize reports bytes; 1024 bytes is the 1 kB cut-off.
    file_size = os.path.getsize(os.path.join("alltxt", file))
    if file_size >= 1024:
        continue
    os.remove(os.path.join("alltxt", file))

            
# Delete every .txt file in alltxt whose file name contains "招标" or "中标".
for file in os.listdir("alltxt"):
    # Entries that are not .txt files are left untouched.
    if not file.endswith(".txt"):
        continue
    # A single keyword hit anywhere in the name is enough to drop the file.
    if any(keyword in file for keyword in ("招标", "中标")):
        os.remove(os.path.join("alltxt", file))

# Rewrite each .txt file in alltxt with its line breaks removed, except that
# a line beginning with one of the markers below keeps its trailing newline.
# NOTE(review): the first two rows are identical; presumably one of them was
# meant to use a different (e.g. full-width) bracket style -- confirm.
prefixes = [
    "(一)", "(二)", "(三)", "(四)", "(五)", "(六)",
    "(一)", "(二)", "(三)", "(四)", "(五)", "(六)",
    "一、", "二、", "三、", "四、", "五、", "六、",
    "1.", "2.", "3.", "4.", "5.", "6.",
    "1、", "2、", "3、", "4、", "5、", "6、",
    "(1)", "(2)", "(3)", "(4)", "(5)", "(6)",
]

for file in os.listdir("alltxt"):
    # Entries that are not .txt files are left untouched.
    if not file.endswith(".txt"):
        continue
    # Load the whole file as a list of lines (newlines still attached).
    with open(os.path.join("alltxt", file), "r") as f:
        contents = f.readlines()
    # Strip the newline from every line, then put it back only on lines that
    # start with a marker; all other lines run straight into the next one.
    contents = [
        line.rstrip("\n") + ("\n" if line.startswith(tuple(prefixes)) else "")
        for line in contents
    ]
    # Overwrite the file in place with the joined text.
    with open(os.path.join("alltxt", file), "w") as f:
        f.writelines(contents)