Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
Administrator
/
framework-tools
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Settings
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit c7cb3ee2
authored
Jul 08, 2024
by
henry
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
添加onlyoffice服务
1 parent
9984e414
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
6 additions
and
454 deletions
arch-clouds/office/pom.xml
arch-clouds/office/src/main/java/org/arch/office/utils/OCRUtils.java
arch-clouds/office/src/main/java/org/arch/office/utils/ParseWordUtils.java
arch-clouds/office/src/main/java/org/arch/office/utils/PrasePdfUtils.java
arch-clouds/office/src/main/java/org/arch/office/utils/XWPFUtils.java
arch-clouds/office/pom.xml
View file @
c7cb3ee
...
...
@@ -27,6 +27,11 @@
<version>
1.0-SNAPSHOT
</version>
<scope>
compile
</scope>
</dependency>
<dependency>
<groupId>
org.apache.httpcomponents
</groupId>
<artifactId>
httpclient
</artifactId>
</dependency>
</dependencies>
</project>
\ No newline at end of file
arch-clouds/office/src/main/java/org/arch/office/utils/OCRUtils.java
deleted
100644 → 0
View file @
9984e41
package
org
.
arch
.
office
.
utils
;
import
com.benjaminwan.ocrlibrary.OcrResult
;
import
io.github.mymonstercat.Model
;
import
io.github.mymonstercat.ocr.InferenceEngine
;
import
lombok.extern.slf4j.Slf4j
;
import
org.apache.commons.io.FileUtils
;
import
javax.imageio.ImageIO
;
import
java.awt.image.BufferedImage
;
import
java.io.File
;
import
java.io.FileInputStream
;
import
java.io.InputStream
;
import
java.net.URL
;
/**
 * Utility class for recognising the text contained in images (OCR).
 *
 * <p>Every public method runs the image through the engine returned by
 * {@code InferenceEngine.getInstance(Model.ONNX_PPOCR_V4)} and returns the recognised text
 * with leading and trailing whitespace removed.
 */
@Slf4j
public class OCRUtils {

    /**
     * Recognises the text of an image stored on the local file system.
     *
     * @param path path of the local image file
     * @return the recognised text, trimmed
     * @throws Exception if the OCR engine fails
     */
    public static String getLocalImageContent(String path) throws Exception {
        return recognize(new File(path));
    }

    /**
     * Downloads an image from a URL and recognises its text.
     *
     * <p>The image is downloaded into a per-call temporary file which is removed again before
     * the method returns, whether or not recognition succeeded.
     *
     * @param httpUrl URL of the image
     * @return the recognised text, trimmed
     * @throws Exception if the URL is malformed, the download fails or the OCR engine fails
     */
    public static String getNetImageContent(String httpUrl) throws Exception {
        URL imageUrl = new URL(httpUrl);
        // A unique temporary file per call: a fixed "image.png" in the working directory would be
        // overwritten by concurrent calls and would stay on disk afterwards.
        File imgFile = File.createTempFile("netImage", ".png");
        try {
            FileUtils.copyURLToFile(imageUrl, imgFile);
            return recognize(imgFile);
        } finally {
            deleteTempFile(imgFile);
        }
    }

    /**
     * Recognises the text of an image supplied as a stream.
     *
     * <p>The stream is decoded, re-encoded as PNG into a temporary file, and that file is handed
     * to the OCR engine. The temporary file is always removed before the method returns. The
     * stream itself is not closed by this method.
     *
     * @param imageStream stream holding the encoded image
     * @return the recognised text, trimmed
     * @throws IllegalArgumentException if the stream cannot be decoded as an image
     * @throws Exception if reading, writing or the OCR engine fails
     */
    public static String getIoImageContent(InputStream imageStream) throws Exception {
        BufferedImage bufferedImage = ImageIO.read(imageStream);
        // ImageIO.read returns null when no registered reader understands the data; fail before
        // a temporary file is created instead of failing later inside ImageIO.write.
        if (bufferedImage == null) {
            throw new IllegalArgumentException("imageStream does not contain a readable image");
        }
        File imgFile = File.createTempFile("tempImage", ".png");
        try {
            ImageIO.write(bufferedImage, "png", imgFile);
            return recognize(imgFile);
        } finally {
            deleteTempFile(imgFile);
        }
    }

    /** Runs the OCR engine on the given image file and returns the trimmed result text. */
    private static String recognize(File imgFile) throws Exception {
        InferenceEngine engine = InferenceEngine.getInstance(Model.ONNX_PPOCR_V4);
        OcrResult ocrResult = engine.runOcr(imgFile.getPath());
        return ocrResult.getStrRes().trim();
    }

    /** Removes a temporary image file if it exists, logging the outcome. */
    private static void deleteTempFile(File file) {
        if (!file.exists()) {
            return;
        }
        if (file.delete()) {
            log.info("删除临时文件");
        } else {
            log.warn("Failed to delete temporary file {}", file.getPath());
        }
    }

    /**
     * Manual smoke test exercising the three entry points with developer-local sample data.
     *
     * @param args unused
     * @throws Exception if any of the sample images cannot be processed
     */
    public static void main(String[] args) throws Exception {
        // 1: recognise text from a local file
        String localImage = "C:\\Users\\hepen\\Desktop\\word校验\\c58b400e2b574243a1b3248d0a5d43ea\\5b7a58e73c444e22beee48a19421e977-27.png";
        System.out.println(getLocalImageContent(localImage));

        // 2: recognise text from an image URL
        String imageUrl = "http://www.yangguangqin.com/uploads/image/20200616/1592298653190882.png";
        System.out.println(getNetImageContent(imageUrl));

        // 3: recognise text from an input stream; the stream is closed once processed
        File file = new File(localImage);
        try (InputStream inputStream = new FileInputStream(file)) {
            String ioImageContent = getIoImageContent(inputStream);
            System.out.println(ioImageContent);
        }
    }
}
arch-clouds/office/src/main/java/org/arch/office/utils/ParseWordUtils.java
deleted
100644 → 0
View file @
9984e41
package
org
.
arch
.
office
.
utils
;
import
com.eadc.entity.vo.UpLoadVO
;
import
com.eadc.service.OssService
;
import
com.eadc.utils.FileUtils
;
import
lombok.extern.slf4j.Slf4j
;
import
org.apache.commons.fileupload.FileItem
;
import
org.apache.pdfbox.pdmodel.PDDocument
;
import
org.apache.pdfbox.text.PDFTextStripper
;
import
org.apache.poi.openxml4j.util.ZipSecureFile
;
import
org.apache.poi.util.Units
;
import
org.apache.poi.xwpf.usermodel.*
;
import
org.springframework.stereotype.Service
;
import
org.springframework.web.multipart.MultipartFile
;
import
org.springframework.web.multipart.commons.CommonsMultipartFile
;
import
javax.annotation.PostConstruct
;
import
javax.annotation.Resource
;
import
java.io.*
;
import
java.util.ArrayList
;
import
java.util.List
;
/**
* 解析word工具类
*/
@Service
@Slf4j
public
class
ParseWordUtils
{
public
static
ParseWordUtils
parseUtils
;
@Resource
private
OssService
ossService
;
@PostConstruct
public
void
init
()
{
parseUtils
=
this
;
parseUtils
.
ossService
=
this
.
ossService
;
}
/**
* 通过指定关键字截取word中输入的两个关键字之间的段落,并将改中间的段落复制到另外一个新的word中
* @param fis 源文件流
* @param keywordStart 开始关键字
* @param keywordEnd 结束关键字
* @param ignorePage 忽略的页面
*/
public
static
UpLoadVO
copyContentByKeywordRange
(
InputStream
fis
,
String
keywordStart
,
String
keywordEnd
,
Integer
ignorePage
)
throws
IOException
{
ZipSecureFile
.
setMinInflateRatio
(-
1.0d
);
File
temp
=
File
.
createTempFile
(
"temp-"
,
".docx"
);
try
(
FileOutputStream
fos
=
new
FileOutputStream
(
temp
))
{
XWPFDocument
sourceDocument
=
new
XWPFDocument
(
fis
);
XWPFDocument
targetDocument
=
new
XWPFDocument
();
boolean
isCopying
=
false
;
if
(
null
==
ignorePage
){
ignorePage
=
0
;
}
int
pageNum
=
0
;
for
(
IBodyElement
element
:
sourceDocument
.
getBodyElements
())
{
if
(
element
instanceof
XWPFParagraph
)
{
// 计算页数
pageNum
+=
1
;
if
(
pageNum
<
ignorePage
)
{
continue
;
}
XWPFParagraph
paragraph
=
(
XWPFParagraph
)
element
;
String
text
=
paragraph
.
getText
();
if
(
text
.
contains
(
keywordStart
))
{
isCopying
=
true
;
}
if
(
isCopying
)
{
if
(
paragraph
.
getCTP
().
getPPr
()
!=
null
&&
paragraph
.
getCTP
().
getPPr
().
getTabs
()
!=
null
)
{
XWPFParagraph
newParagraph
=
targetDocument
.
createParagraph
();
newParagraph
.
getCTP
().
setPPr
(
paragraph
.
getCTP
().
getPPr
());
extracted
(
paragraph
,
newParagraph
);
}
else
{
XWPFParagraph
newParagraph
=
targetDocument
.
createParagraph
();
extracted
(
paragraph
,
newParagraph
);
}
}
if
(
text
.
contains
(
keywordEnd
))
{
isCopying
=
false
;
}
}
else
if
(
element
instanceof
XWPFTable
&&
isCopying
)
{
XWPFTable
table
=
(
XWPFTable
)
element
;
XWPFTable
newTable
=
targetDocument
.
createTable
();
newTable
.
getCTTbl
().
set
(
table
.
getCTTbl
());
}
else
if
(
element
instanceof
XWPFChart
)
{
if
(
isCopying
)
{
XWPFChart
chart
=
(
XWPFChart
)
element
;
XWPFChart
newChart
=
targetDocument
.
createChart
();
newChart
.
getCTChart
().
set
(
chart
.
getCTChart
().
copy
());
}
}
}
targetDocument
.
write
(
fos
);
FileItem
fileItem
=
FileUtils
.
createFileItem
(
temp
);
MultipartFile
multipartFile
=
new
CommonsMultipartFile
(
fileItem
);
UpLoadVO
upload
=
parseUtils
.
ossService
.
upload
(
multipartFile
);
log
.
info
(
"获取上传文件的id为:{}"
,
upload
.
getFileId
());
//删除临时文件
temp
.
delete
();
return
upload
;
}
catch
(
Exception
e
)
{
e
.
printStackTrace
();
}
return
null
;
}
private
static
void
extracted
(
XWPFParagraph
paragraph
,
XWPFParagraph
newParagraph
)
throws
Exception
{
for
(
XWPFRun
run
:
paragraph
.
getRuns
())
{
XWPFRun
newRun
=
newParagraph
.
createRun
();
newRun
.
setText
(
run
.
getText
(
0
));
newRun
.
setBold
(
run
.
isBold
());
newRun
.
setItalic
(
run
.
isItalic
());
for
(
XWPFPicture
picture
:
run
.
getEmbeddedPictures
())
{
byte
[]
pictureData
=
picture
.
getPictureData
().
getData
();
int
pictureType
=
picture
.
getPictureData
().
getPictureType
();
newRun
.
addPicture
(
new
ByteArrayInputStream
(
pictureData
),
pictureType
,
"Copied Image"
,
Units
.
toEMU
(
400
),
Units
.
toEMU
(
400
));
}
}
}
/**
* 通过关键字获取包含关键字的行
* @param pdfPath
* @param searchKeyWord
* @return
* @throws IOException
*/
public
static
List
<
String
>
parsePdf
(
String
pdfPath
,
String
searchKeyWord
)
throws
IOException
{
List
<
String
>
arrList
=
new
ArrayList
<>();
PDDocument
document
=
null
;
try
{
File
file
=
new
File
(
pdfPath
);
document
=
PDDocument
.
load
(
file
);
PDFTextStripper
pdfStripper
=
new
PDFTextStripper
();
String
text
=
pdfStripper
.
getText
(
document
);
String
[]
lines
=
text
.
split
(
System
.
lineSeparator
());
// 将文本内容按行分割成数组
String
keyword
=
searchKeyWord
;
// 指定关键字
for
(
String
line
:
lines
)
{
if
(
line
.
contains
(
keyword
))
{
// 检查每行是否包含指定关键字
log
.
info
(
"查询指定的关键字的行为:{}"
,
line
);
arrList
.
add
(
line
);
}
}
return
arrList
;
}
catch
(
IOException
e
)
{
e
.
printStackTrace
();
}
finally
{
if
(
null
!=
document
){
document
.
close
();
}
}
return
null
;
}
public
static
void
main
(
String
[]
args
)
throws
Exception
{
String
filePath
=
"C:\\Users\\hepen\\Desktop\\word校验\\概要设计.docx"
;
File
file
=
new
File
(
"F:\\test-word\\test1.docx"
);
InputStream
inputStream1
=
new
FileInputStream
(
file
);
InputStream
inputStream2
=
new
FileInputStream
(
new
File
(
filePath
));
//copyContentByKeywordRange(inputStream, "1.监督评价考核", "2.日常运营",10);
}
}
arch-clouds/office/src/main/java/org/arch/office/utils/PrasePdfUtils.java
deleted
100644 → 0
View file @
9984e41
package
org
.
arch
.
office
.
utils
;
import
cn.hutool.core.lang.Assert
;
import
com.eadc.modules.system.service.dto.File
;
import
com.eadc.modules.system.service.mapper.FileMapper
;
import
org.apache.pdfbox.pdmodel.PDDocument
;
import
org.apache.pdfbox.text.PDFTextStripper
;
import
org.dromara.x.file.storage.core.Downloader
;
import
org.dromara.x.file.storage.core.FileStorageService
;
import
org.springframework.stereotype.Service
;
import
javax.annotation.PostConstruct
;
import
javax.annotation.Resource
;
import
java.io.ByteArrayInputStream
;
import
java.io.IOException
;
/**
 * PDF parsing helper. Scanned (image-only) PDFs cannot be parsed by this class.
 *
 * <p>The class is a Spring bean so that its collaborators can be injected; the bean publishes
 * itself through the static {@link #prasePdfUtils} field so the static helper can reach them.
 */
@Service
public class PrasePdfUtils {

    /** Bean instance published by {@link #init()}; {@code null} until that method has run. */
    public static PrasePdfUtils prasePdfUtils;

    @Resource
    private FileStorageService fileStorageService;

    @Resource
    private FileMapper sysFileMapper;

    /** Publishes this bean, and with it the injected collaborators, to the static helper. */
    @PostConstruct
    public void init() {
        prasePdfUtils = this;
    }

    /**
     * Extracts the text of a stored PDF.
     *
     * <p>The file record is looked up by id, its content is downloaded through the file storage
     * service and the text is extracted with {@code PDFTextStripper}.
     *
     * @param fileId id of the file record
     * @return the extracted text, or {@code null} if the record does not exist or any step failed
     * @throws IOException declared for compatibility; failures are reported as {@code null}
     */
    public static String prasePdf(String fileId) throws IOException {
        try {
            File file = prasePdfUtils.sysFileMapper.selectById(fileId);
            Assert.notNull(file, "文件不存在");
            Downloader download = prasePdfUtils.fileStorageService.download(file.getUrl());
            byte[] bytes = download.bytes();
            // Both resources are closed automatically, in reverse order, on every exit path.
            try (ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(bytes);
                 PDDocument document = PDDocument.load(byteArrayInputStream)) {
                // Create a PDFTextStripper and extract the text of the document.
                PDFTextStripper pdfStripper = new PDFTextStripper();
                return pdfStripper.getText(document);
            }
        } catch (Exception e) {
            // NOTE(review): no logger is in scope for this class; route this through the
            // project logger once one is available.
            e.printStackTrace();
            return null;
        }
    }
}
arch-clouds/office/src/main/java/org/arch/office/utils/XWPFUtils.java
deleted
100644 → 0
View file @
9984e41
package
org
.
arch
.
office
.
utils
;
import
com.microsoft.schemas.office.office.CTOLEObject
;
import
com.microsoft.schemas.vml.CTShape
;
import
org.apache.poi.xwpf.usermodel.*
;
import
org.apache.xmlbeans.XmlCursor
;
import
org.apache.xmlbeans.XmlObject
;
import
org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObject
;
import
org.openxmlformats.schemas.drawingml.x2006.picture.CTPicture
;
import
org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline
;
import
org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDrawing
;
import
org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject
;
import
org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR
;
import
java.math.BigInteger
;
import
java.util.ArrayList
;
import
java.util.HashMap
;
import
java.util.List
;
import
java.util.Map
;
/**
 * Helpers for inspecting the low-level XML of Word paragraphs.
 *
 * @author XieWenYing
 * @version V1.0 (2020/2/12)
 */
public class XWPFUtils {

    /**
     * Collects the relationship ids of the images and embedded objects referenced by a paragraph.
     *
     * <p>Two kinds of run children are inspected: {@code <w:drawing>} elements (only their inline
     * drawings) and {@code <w:object>} elements (VML shapes and OLE objects).
     *
     * @param paragraph paragraph to inspect
     * @return a map with two entries: {@code "img"} holds the image ids, {@code "object"} holds
     *         the OLE object ids; both lists may be empty
     */
    public static Map<String, List<String>> readAttachInParagraph(XWPFParagraph paragraph) {
        List<String> imageBundleList = new ArrayList<>();
        List<String> objectBundleList = new ArrayList<>();

        for (XWPFRun run : paragraph.getRuns()) {
            CTR ctr = run.getCTR();
            XmlCursor xmlCursor = ctr.newCursor();
            try {
                // Select all direct children of the run.
                xmlCursor.selectPath("./*");
                while (xmlCursor.toNextSelection()) {
                    XmlObject o = xmlCursor.getObject();
                    if (o instanceof CTDrawing) {
                        collectDrawingImages((CTDrawing) o, imageBundleList);
                    }
                    if (o instanceof CTObject) {
                        collectObjectAttachments((CTObject) o, imageBundleList, objectBundleList);
                    }
                }
            } finally {
                // Cursors hold resources and should be released once no longer needed.
                xmlCursor.dispose();
            }
        }

        Map<String, List<String>> map = new HashMap<>();
        map.put("img", imageBundleList);
        map.put("object", objectBundleList);
        return map;
    }

    /** Adds the embed id of every {@code <pic:pic>} found in the inline drawings to the list. */
    private static void collectDrawingImages(CTDrawing drawing, List<String> imageBundleList) {
        for (CTInline inline : drawing.getInlineList()) {
            CTGraphicalObject graphic = inline.getGraphic();
            XmlCursor cursor = graphic.getGraphicData().newCursor();
            try {
                cursor.selectPath("./*");
                while (cursor.toNextSelection()) {
                    XmlObject object = cursor.getObject();
                    if (object instanceof CTPicture) {
                        CTPicture picture = (CTPicture) object;
                        // A picture without a blip has no id to report; skip it.
                        if (picture.getBlipFill() != null && picture.getBlipFill().getBlip() != null) {
                            imageBundleList.add(picture.getBlipFill().getBlip().getEmbed());
                        }
                    }
                }
            } finally {
                cursor.dispose();
            }
        }
    }

    /** Adds the image ids of VML shapes and the ids of OLE objects inside a {@code <w:object>}. */
    private static void collectObjectAttachments(CTObject object, List<String> imageBundleList,
                                                 List<String> objectBundleList) {
        XmlCursor cursor = object.newCursor();
        try {
            cursor.selectPath("./*");
            while (cursor.toNextSelection()) {
                XmlObject xmlObject = cursor.getObject();
                // Shape: record the id of its first image data element, if it has one.
                if (xmlObject instanceof CTShape) {
                    CTShape shape = (CTShape) xmlObject;
                    if (shape.sizeOfImagedataArray() > 0) {
                        imageBundleList.add(shape.getImagedataArray(0).getId2());
                    }
                }
                // Embedded OLE object: record the object id.
                if (xmlObject instanceof CTOLEObject) {
                    CTOLEObject oleObject = (CTOLEObject) xmlObject;
                    objectBundleList.add(oleObject.getId());
                }
            }
        } finally {
            cursor.dispose();
        }
    }

    /**
     * Determines the outline level of a paragraph.
     *
     * <p>The level set directly on the paragraph wins. Otherwise the paragraph's style is
     * consulted, followed by the styles it is based on, until one of them defines a level.
     *
     * @param document  document the paragraph belongs to; supplies the style table
     * @param paragraph paragraph to inspect
     * @return the outline level, or {@code null} if neither the paragraph nor any style in its
     *         based-on chain defines one
     */
    public static BigInteger getParaOutlineLvl(XWPFDocument document, XWPFParagraph paragraph) {
        // Level set directly on the paragraph. A paragraph may have no properties at all.
        if (paragraph.getCTP().getPPr() != null
                && paragraph.getCTP().getPPr().getOutlineLvl() != null) {
            return paragraph.getCTP().getPPr().getOutlineLvl().getVal();
        }

        XWPFStyles styles = document.getStyles();
        if (styles == null) {
            return null;
        }

        // Walk the style and its based-on ancestors; remember visited ids so a cyclic
        // based-on reference cannot loop forever.
        List<String> visitedStyleIds = new ArrayList<>();
        XWPFStyle style = styles.getStyle(paragraph.getStyle());
        while (style != null && style.getCTStyle() != null) {
            if (style.getCTStyle().getPPr() != null
                    && style.getCTStyle().getPPr().getOutlineLvl() != null) {
                return style.getCTStyle().getPPr().getOutlineLvl().getVal();
            }
            if (style.getCTStyle().getBasedOn() == null) {
                break;
            }
            String parentStyleId = style.getCTStyle().getBasedOn().getVal();
            if (parentStyleId == null || visitedStyleIds.contains(parentStyleId)) {
                break;
            }
            visitedStyleIds.add(parentStyleId);
            style = styles.getStyle(parentStyleId);
        }

        // No outline level defined anywhere.
        return null;
    }
}
Write
Preview
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment