wordUtils 解析 word

发布时间 2023-04-20 13:50:36作者: lshan

 

读取 Word(.docx)文档中的文本以及图片

 

依赖:

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>${poi.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>${poi.version}</version>
        </dependency>

 

 

WordUtil:


import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.junit.jupiter.api.Test;
import java.io.*;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/***************************
 *<pre>
 * @Project Name : sea-dep-service
 * @Package      : com.sea.x.common.utils
 * @File Name    : WordUtil
 * @Author       : Sea
 * @Mail         : lshan@523@163
 * @Date         : 2023/2/3 15:23
 * @Purpose      :
 * @History      :
 *</pre>
 ***************************/
@Slf4j
public class WordUtil {

    public  final  static  String  TEXT="text";
    public  final  static  String  IMAGE="image";


    /**
     * 读取文本内容
     * @param inputStream
     * @return
     */
    public static String readWord(InputStream inputStream) {
        return readWord(inputStream, false).getOrDefault(TEXT,"")+"";
    }

    public static String readWord(File file) {
        return readWord(file, false).getOrDefault(TEXT,"")+"";
    }

    /**
     * @param file
     * @param isImgRead
     * @return {"text":"...",  "image":{'fileName':'byte[]'}}
     */
    public static Map readWord(File file, boolean isImgRead) {
        Map<String, Object> result = new HashMap<>();
        try {
           result =readWord(new FileInputStream(file), isImgRead);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
            log.error("parse word :{} error :{}",file.getName(), e);
        }
        return result;
    }

    /**
     * @param fis
     * @param isImgRead  是否获取图片数据
     * @return  {"text":"...",  "image":{'fileName':'byte[]'}}
     */
    public static Map<String, Object> readWord(InputStream fis, boolean isImgRead)
    {
        Map<String, Object> result = new HashMap<>();
        XWPFDocument document = null;
        XWPFWordExtractor extractor = null;
        try {
            document = new XWPFDocument(fis);
            //文件名, byte[]
            HashMap<String, byte[]> picData =null;
            if(isImgRead){
                picData = new HashMap<>();
                List<XWPFPictureData> allPictures = document.getAllPackagePictures();
                for(XWPFPictureData  p : allPictures)
                { //获取简历中个人图片
                    byte[] data = p.getData();
                    picData.put(data.length+"_"+p.getFileName(),data);
//                    IOUtils.write(p.getData(), new FileOutputStream("C:\\Users\\Sea\\Desktop\\pic\\"+p.getFileName()+""));
                }
            }
            extractor = new XWPFWordExtractor(document);
            String text = extractor.getText();
            result.put(TEXT,text);
            result.put(IMAGE,picData);
        } catch (Exception e) {
            e.printStackTrace();
            log.error("parse word  error :{}", e);
        }finally {
                try {
                     extractor.close();
                     document.close();
                } catch (IOException e) {

                }
        }
        return  result;
    }




    static Pattern  emailPattern = Pattern.compile("[a-zA-Z0-9]+@[a-zA-Z0-9]+\\.[a-zA-Z0-9]+");
    /**
     *  以姓名: or  姓 名:开头, 空格结尾
     */
    static Pattern  namePattern = Pattern.compile("(姓.*?名)(.*?)( )");
    static Pattern  phonePattern = Pattern.compile("1(3\\d|4[5-9]|5[0-35-9]|6[567]|7[0-8]|8\\d|9[0-35-9])\\d{8}");

    /**
     * 提取部分信息
     * @param text
     * @return
     */
    private static Map<String,String> getMainInfo(String text){
        Matcher nameMatcher = namePattern.matcher(text);
        Matcher emailMatcher = emailPattern.matcher(text);
        Matcher phoneMatcher = phonePattern.matcher(text);
        String userName = (nameMatcher.find()?nameMatcher.group(2):"").replace("","").replace(":","");
        String email = emailMatcher.find()?emailMatcher.group(0):"";
        String phone = phoneMatcher.find()?phoneMatcher.group(0):"";
        userName =  StringUtils.isNotBlank(userName)?userName : text.trim().substring(0, 10).replace("\n"," ").replace("\t"," ").split(" ")[0];
        String sex = text.contains("")?"0":"1";
        System.err.println("name : "+ userName);
        System.err.println("email : "+ email);
        System.err.println("phone : "+ phone);
        System.err.println("sex : "+ sex);
        String lastUserName = userName;
        return new HashMap<String,String>(){{
            put("sex",sex);
            put("userName", lastUserName);
            put("email",email);
            put("phone",phone);
        }};

    }


@Test
public void testName() throws Exception
{
    String fileName = "C:/Users/Sea/Downloads/job_word_20230202/sea工程师46152253.docx";
    File file = new File(fileName);
    Map map = readWord(file, true);
    String txt = map.get(TEXT)+"";
    getMainInfo(txt);

    System.err.println(txt);
    Map<String,byte[]> imgs =(Map<String,byte[]>) map.get(IMAGE);

    imgs.forEach((fn,bt)->{
        try {
            FileOutputStream fileOutputStream = new FileOutputStream(fn);
            IOUtils.write(bt,fileOutputStream);
        } catch (Exception e) {
            e.printStackTrace();
        }

    });
}




}