Commit c7cb3ee2 by henry

添加onlyoffice服务

1 parent 9984e414
......@@ -27,6 +27,11 @@
<version>1.0-SNAPSHOT</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
</dependencies>
</project>
\ No newline at end of file
package org.arch.office.utils;
import com.benjaminwan.ocrlibrary.OcrResult;
import io.github.mymonstercat.Model;
import io.github.mymonstercat.ocr.InferenceEngine;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.URL;
/**
 * Utility class for recognizing text in images (OCR).
 *
 * <p>Every public method hands an image file to the {@code ONNX_PPOCR_V4} inference engine and
 * returns the recognized text with leading and trailing whitespace trimmed.
 */
@Slf4j
public class OCRUtils {

    /**
     * Recognizes the text contained in an image stored on the local file system.
     *
     * @param path path of the local image file
     * @return the recognized text, trimmed
     * @throws Exception if the OCR engine fails
     */
    public static String getLocalImageContent(String path) throws Exception {
        return recognize(new File(path));
    }

    /**
     * Downloads an image from a URL and recognizes the text it contains.
     *
     * <p>The image is downloaded into a uniquely named temporary file (instead of a fixed
     * {@code image.png} in the working directory) so that concurrent calls cannot overwrite each
     * other's download, and the file is removed again once recognition has finished or failed.
     *
     * @param httpUrl URL of the image
     * @return the recognized text, trimmed
     * @throws Exception if the URL is malformed, the download fails, or the OCR engine fails
     */
    public static String getNetImageContent(String httpUrl) throws Exception {
        URL imageUrl = new URL(httpUrl);
        File imgFile = File.createTempFile("netImage", ".png");
        try {
            FileUtils.copyURLToFile(imageUrl, imgFile);
            return recognize(imgFile);
        } finally {
            deleteTempFile(imgFile);
        }
    }

    /**
     * Recognizes the text contained in an image supplied as a stream.
     *
     * <p>The stream is decoded, re-encoded as PNG into a temporary file, and that file is passed
     * to the OCR engine. The temporary file is always removed afterwards. The stream itself is not
     * closed by this method.
     *
     * @param imageStream stream holding the encoded image
     * @return the recognized text, trimmed
     * @throws IllegalArgumentException if the stream does not contain an image that can be decoded
     * @throws Exception if the temporary file cannot be written or the OCR engine fails
     */
    public static String getIoImageContent(InputStream imageStream) throws Exception {
        BufferedImage bufferedImage = ImageIO.read(imageStream);
        if (bufferedImage == null) {
            // ImageIO.read returns null (rather than throwing) when no reader understands the data.
            throw new IllegalArgumentException("imageStream does not contain a decodable image");
        }
        // The OCR engine is given a file path, so materialize the image as a temporary file.
        File imgFile = File.createTempFile("tempImage", ".png");
        try {
            if (!ImageIO.write(bufferedImage, "png", imgFile)) {
                throw new java.io.IOException("No PNG writer available for the decoded image");
            }
            return recognize(imgFile);
        } finally {
            deleteTempFile(imgFile);
        }
    }

    /** Runs the OCR engine on the given image file and returns the trimmed result text. */
    private static String recognize(File imgFile) throws Exception {
        InferenceEngine engine = InferenceEngine.getInstance(Model.ONNX_PPOCR_V4);
        OcrResult ocrResult = engine.runOcr(imgFile.getPath());
        return ocrResult.getStrRes().trim();
    }

    /** Deletes a temporary image file, logging the outcome. */
    private static void deleteTempFile(File tempFile) {
        if (!tempFile.exists()) {
            return;
        }
        if (tempFile.delete()) {
            log.info("删除临时文件");
        } else {
            log.warn("临时文件删除失败:{}", tempFile.getPath());
        }
    }

    /**
     * Manual smoke test that exercises the three entry points with developer-local sample data.
     *
     * @param args unused
     * @throws Exception if any of the sample images cannot be read or recognized
     */
    public static void main(String[] args) throws Exception {
        // 1: recognize text from a local file
        String localImage = "C:\\Users\\hepen\\Desktop\\word校验\\c58b400e2b574243a1b3248d0a5d43ea\\5b7a58e73c444e22beee48a19421e977-27.png";
        System.out.println(getLocalImageContent(localImage));
        // 2: recognize text from an image on the network
        String imageUrl = "http://www.yangguangqin.com/uploads/image/20200616/1592298653190882.png";
        System.out.println(getNetImageContent(imageUrl));
        // 3: recognize text from a stream; the stream is closed here because the callee does not
        try (InputStream inputStream = new FileInputStream(new File(localImage))) {
            String ioImageContent = getIoImageContent(inputStream);
            System.out.println(ioImageContent);
        }
    }
}
package org.arch.office.utils;
import com.eadc.entity.vo.UpLoadVO;
import com.eadc.service.OssService;
import com.eadc.utils.FileUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.fileupload.FileItem;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.poi.util.Units;
import org.apache.poi.xwpf.usermodel.*;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.multipart.commons.CommonsMultipartFile;
import javax.annotation.PostConstruct;
import javax.annotation.Resource;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
* 解析word工具类
*/
@Service
@Slf4j
public class ParseWordUtils {
public static ParseWordUtils parseUtils;
@Resource
private OssService ossService;
@PostConstruct
public void init() {
parseUtils = this;
parseUtils.ossService = this.ossService;
}
/**
* 通过指定关键字截取word中输入的两个关键字之间的段落,并将改中间的段落复制到另外一个新的word中
* @param fis 源文件流
* @param keywordStart 开始关键字
* @param keywordEnd 结束关键字
* @param ignorePage 忽略的页面
*/
public static UpLoadVO copyContentByKeywordRange(InputStream fis,
String keywordStart,
String keywordEnd,
Integer ignorePage) throws IOException {
ZipSecureFile.setMinInflateRatio(-1.0d);
File temp = File.createTempFile("temp-", ".docx");
try (
FileOutputStream fos = new FileOutputStream(temp)) {
XWPFDocument sourceDocument = new XWPFDocument(fis);
XWPFDocument targetDocument = new XWPFDocument();
boolean isCopying = false;
if(null==ignorePage){
ignorePage = 0;
}
int pageNum = 0;
for (IBodyElement element : sourceDocument.getBodyElements()) {
if (element instanceof XWPFParagraph) {
// 计算页数
pageNum += 1;
if (pageNum < ignorePage) {
continue;
}
XWPFParagraph paragraph = (XWPFParagraph) element;
String text = paragraph.getText();
if (text.contains(keywordStart)) {
isCopying = true;
}
if (isCopying) {
if (paragraph.getCTP().getPPr() != null && paragraph.getCTP().getPPr().getTabs() != null) {
XWPFParagraph newParagraph = targetDocument.createParagraph();
newParagraph.getCTP().setPPr(paragraph.getCTP().getPPr());
extracted(paragraph, newParagraph);
} else {
XWPFParagraph newParagraph = targetDocument.createParagraph();
extracted(paragraph, newParagraph);
}
}
if (text.contains(keywordEnd)) {
isCopying = false;
}
} else if (element instanceof XWPFTable && isCopying) {
XWPFTable table = (XWPFTable) element;
XWPFTable newTable = targetDocument.createTable();
newTable.getCTTbl().set(table.getCTTbl());
} else if (element instanceof XWPFChart) {
if (isCopying) {
XWPFChart chart = (XWPFChart) element;
XWPFChart newChart = targetDocument.createChart();
newChart.getCTChart().set(chart.getCTChart().copy());
}
}
}
targetDocument.write(fos);
FileItem fileItem = FileUtils.createFileItem(temp);
MultipartFile multipartFile = new CommonsMultipartFile(fileItem);
UpLoadVO upload = parseUtils.ossService.upload(multipartFile);
log.info("获取上传文件的id为:{}",upload.getFileId());
//删除临时文件
temp.delete();
return upload;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
private static void extracted(XWPFParagraph paragraph, XWPFParagraph newParagraph) throws Exception {
for (XWPFRun run : paragraph.getRuns()) {
XWPFRun newRun = newParagraph.createRun();
newRun.setText(run.getText(0));
newRun.setBold(run.isBold());
newRun.setItalic(run.isItalic());
for (XWPFPicture picture : run.getEmbeddedPictures()) {
byte[] pictureData = picture.getPictureData().getData();
int pictureType = picture.getPictureData().getPictureType();
newRun.addPicture(new ByteArrayInputStream(pictureData), pictureType, "Copied Image",
Units.toEMU(400), Units.toEMU(400));
}
}
}
/**
* 通过关键字获取包含关键字的行
* @param pdfPath
* @param searchKeyWord
* @return
* @throws IOException
*/
public static List<String> parsePdf(String pdfPath,String searchKeyWord) throws IOException {
List<String> arrList = new ArrayList<>();
PDDocument document = null;
try {
File file = new File(pdfPath);
document = PDDocument.load(file);
PDFTextStripper pdfStripper = new PDFTextStripper();
String text = pdfStripper.getText(document);
String[] lines = text.split(System.lineSeparator()); // 将文本内容按行分割成数组
String keyword = searchKeyWord; // 指定关键字
for (String line : lines) {
if (line.contains(keyword)) { // 检查每行是否包含指定关键字
log.info("查询指定的关键字的行为:{}",line);
arrList.add(line);
}
}
return arrList;
} catch (IOException e) {
e.printStackTrace();
} finally {
if(null!=document){
document.close();
}
}
return null;
}
public static void main(String[] args) throws Exception {
String filePath="C:\\Users\\hepen\\Desktop\\word校验\\概要设计.docx";
File file = new File("F:\\test-word\\test1.docx");
InputStream inputStream1 = new FileInputStream(file);
InputStream inputStream2 = new FileInputStream(new File(filePath));
//copyContentByKeywordRange(inputStream, "1.监督评价考核", "2.日常运营",10);
}
}
package org.arch.office.utils;
import cn.hutool.core.lang.Assert;
import com.eadc.modules.system.service.dto.File;
import com.eadc.modules.system.service.mapper.FileMapper;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.dromara.x.file.storage.core.Downloader;
import org.dromara.x.file.storage.core.FileStorageService;
import org.springframework.stereotype.Service;
import javax.annotation.PostConstruct;
import javax.annotation.Resource;
import java.io.ByteArrayInputStream;
import java.io.IOException;
/**
* 解析pdf工具类【对于扫描版的pdf是无法解析的】
*/
@Service
public class PrasePdfUtils {
public static PrasePdfUtils prasePdfUtils;
@Resource
private FileStorageService fileStorageService;
@Resource
private FileMapper sysFileMapper;
@PostConstruct
public void init() {
prasePdfUtils = this;
prasePdfUtils.fileStorageService = this.fileStorageService;
prasePdfUtils.sysFileMapper = this.sysFileMapper;
}
public static String prasePdf(String fileId) throws IOException {
PDDocument document = null;
ByteArrayInputStream byteArrayInputStream = null;
String text = "";
try{
File file = prasePdfUtils.sysFileMapper.selectById(fileId);
Assert.notNull(file, "文件不存在");
Downloader download = prasePdfUtils.fileStorageService.download(file.getUrl());
byte[] bytes = download.bytes();
byteArrayInputStream = new ByteArrayInputStream(bytes);
document = PDDocument.load(byteArrayInputStream);
// 创建PDFTextStripper对象并从文档中提取文本
PDFTextStripper pdfStripper = new PDFTextStripper();
text = pdfStripper.getText(document);
return text;
}catch (Exception e){
e.printStackTrace();
} finally {
if(null!=document){
document.close();
}
if(null!=byteArrayInputStream){
byteArrayInputStream.close();
}
}
return null;
}
}
package org.arch.office.utils;
import com.microsoft.schemas.office.office.CTOLEObject;
import com.microsoft.schemas.vml.CTShape;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObject;
import org.openxmlformats.schemas.drawingml.x2006.picture.CTPicture;
import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDrawing;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Helpers for inspecting the low-level XML of Word (.docx) paragraphs: locating the relationship
 * ids of pictures and embedded objects, and resolving a paragraph's outline level.
 *
 * <p>Date: 2020/2/12 21:09
 *
 * @author XieWenYing
 * @version V1.0
 */
public class XWPFUtils {

    /**
     * Collects the ids of the pictures and embedded OLE objects referenced by a paragraph.
     *
     * <p>Each run's direct child elements are inspected. Inline drawings contribute the embed id
     * of every picture they contain; {@code w:object} elements contribute the image-data id of
     * each VML shape and the id of each OLE object.
     *
     * @param paragraph the paragraph to inspect
     * @return a map with two entries: {@code "img"} holds the picture ids and {@code "object"}
     *         holds the embedded-object ids; both lists may be empty
     */
    public static Map<String, List<String>> readAttachInParagraph(XWPFParagraph paragraph) {
        List<String> imageBundleList = new ArrayList<>();
        List<String> objectBundleList = new ArrayList<>();
        for (XWPFRun run : paragraph.getRuns()) {
            CTR ctr = run.getCTR();
            XmlCursor xmlCursor = ctr.newCursor();
            try {
                // Select every direct child element of the run.
                xmlCursor.selectPath("./*");
                while (xmlCursor.toNextSelection()) {
                    XmlObject child = xmlCursor.getObject();
                    // <w:drawing>: pictures stored as DrawingML
                    if (child instanceof CTDrawing) {
                        collectDrawingImages((CTDrawing) child, imageBundleList);
                    }
                    // <w:object>: VML pictures and embedded OLE objects
                    if (child instanceof CTObject) {
                        collectObjectAttachments((CTObject) child, imageBundleList, objectBundleList);
                    }
                }
            } finally {
                // Cursors hold on to document resources until they are released.
                xmlCursor.dispose();
            }
        }
        Map<String, List<String>> map = new HashMap<>();
        map.put("img", imageBundleList);
        map.put("object", objectBundleList);
        return map;
    }

    /** Adds the embed id of every picture found in the drawing's inline graphics. */
    private static void collectDrawingImages(CTDrawing drawing, List<String> imageBundleList) {
        for (CTInline inline : drawing.getInlineList()) {
            CTGraphicalObject graphic = inline.getGraphic();
            XmlCursor cursor = graphic.getGraphicData().newCursor();
            try {
                cursor.selectPath("./*");
                while (cursor.toNextSelection()) {
                    XmlObject object = cursor.getObject();
                    // <pic:pic> element
                    if (object instanceof CTPicture) {
                        CTPicture picture = (CTPicture) object;
                        // Skip pictures without a blip instead of failing on them.
                        if (picture.getBlipFill() != null && picture.getBlipFill().getBlip() != null) {
                            imageBundleList.add(picture.getBlipFill().getBlip().getEmbed());
                        }
                    }
                }
            } finally {
                cursor.dispose();
            }
        }
    }

    /** Adds the image ids of VML shapes and the ids of OLE objects found inside a w:object. */
    private static void collectObjectAttachments(CTObject object,
                                                 List<String> imageBundleList,
                                                 List<String> objectBundleList) {
        XmlCursor cursor = object.newCursor();
        try {
            cursor.selectPath("./*");
            while (cursor.toNextSelection()) {
                XmlObject xmlObject = cursor.getObject();
                // Picture shape: store the image id (shapes without image data are skipped).
                if (xmlObject instanceof CTShape) {
                    CTShape shape = (CTShape) xmlObject;
                    if (shape.sizeOfImagedataArray() > 0) {
                        imageBundleList.add(shape.getImagedataArray(0).getId2());
                    }
                }
                // Embedded object: store the object id.
                if (xmlObject instanceof CTOLEObject) {
                    CTOLEObject oleObject = (CTOLEObject) xmlObject;
                    objectBundleList.add(oleObject.getId());
                }
            }
        } finally {
            cursor.dispose();
        }
    }

    /**
     * Resolves the outline level of a paragraph.
     *
     * <p>The level set directly on the paragraph wins. Otherwise the paragraph's style is
     * consulted, followed by the styles it is based on, until one of them defines an outline
     * level. Missing paragraph properties, missing styles and styles without paragraph
     * properties are treated as "no outline level" rather than causing an error.
     *
     * @param document  the document that owns the paragraph (supplies the style definitions)
     * @param paragraph the paragraph to inspect
     * @return the outline level, or {@code null} if none is defined
     */
    public static BigInteger getParaOutlineLvl(XWPFDocument document, XWPFParagraph paragraph) {
        // 1. Outline level set directly on the paragraph.
        if (paragraph.getCTP().getPPr() != null
                && paragraph.getCTP().getPPr().getOutlineLvl() != null) {
            return paragraph.getCTP().getPPr().getOutlineLvl().getVal();
        }
        XWPFStyles styles = document.getStyles();
        if (styles == null) {
            return null;
        }
        // 2. Outline level inherited from the paragraph style or one of its base styles.
        List<String> visitedStyleIds = new ArrayList<>();
        String styleId = paragraph.getStyle();
        // The visited list guards against a malformed document with a cyclic based-on chain.
        while (styleId != null && !visitedStyleIds.contains(styleId)) {
            visitedStyleIds.add(styleId);
            XWPFStyle style = styles.getStyle(styleId);
            if (style == null || style.getCTStyle() == null) {
                return null;
            }
            if (style.getCTStyle().getPPr() != null
                    && style.getCTStyle().getPPr().getOutlineLvl() != null) {
                return style.getCTStyle().getPPr().getOutlineLvl().getVal();
            }
            styleId = (style.getCTStyle().getBasedOn() == null)
                    ? null
                    : style.getCTStyle().getBasedOn().getVal();
        }
        // 3. No outline level defined anywhere.
        return null;
    }
}
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!