|
@@ -0,0 +1,374 @@
|
|
|
+package com.zs.test;
|
|
|
+
|
|
|
+import com.fasterxml.jackson.databind.DeserializationFeature;
|
|
|
+import com.fasterxml.jackson.databind.ObjectMapper;
|
|
|
+import com.opencsv.CSVWriter;
|
|
|
+import org.apache.commons.io.FileUtils;
|
|
|
+import org.apache.commons.lang3.RegExUtils;
|
|
|
+import org.apache.commons.lang3.StringUtils;
|
|
|
+import org.apache.commons.lang3.math.NumberUtils;
|
|
|
+import org.assertj.core.util.Lists;
|
|
|
+import org.json.JSONException;
|
|
|
+import org.junit.jupiter.api.Test;
|
|
|
+import org.springframework.beans.BeanUtils;
|
|
|
+
|
|
|
+import javax.imageio.ImageIO;
|
|
|
+import java.awt.Color;
|
|
|
+import java.awt.Font;
|
|
|
+import java.awt.*;
|
|
|
+import java.awt.image.BufferedImage;
|
|
|
+import java.io.File;
|
|
|
+import java.io.FileWriter;
|
|
|
+import java.io.IOException;
|
|
|
+import java.util.List;
|
|
|
+import java.util.*;
|
|
|
+
|
|
|
+public class ImagesTest {
|
|
|
+ @Test
|
|
|
+ public void testDrawImage() throws IOException {
|
|
|
+ BufferedImage oriImage = ImageIO.read(new File("D:\\data\\ocr\\招生报1.jpg"));
|
|
|
+ BufferedImage image = new BufferedImage(oriImage.getWidth(), oriImage.getHeight(), BufferedImage.TYPE_INT_RGB);
|
|
|
+ Graphics2D g2d = image.createGraphics();
|
|
|
+ g2d.setColor(Color.WHITE);
|
|
|
+ g2d.fillRect(0, 0, image.getWidth(), image.getHeight());
|
|
|
+ g2d.setColor(Color.BLACK);
|
|
|
+ // g2d.drawOval(25, 25, 150, 150);
|
|
|
+
|
|
|
+ g2d.setFont(new Font("宋体", Font.BOLD, 24));
|
|
|
+ g2d.drawString("69护理学", 18, 481);
|
|
|
+
|
|
|
+ g2d.dispose();
|
|
|
+ ImageIO.write(image, "png", new File("D:\\data\\ocr\\output_image.png"));
|
|
|
+ System.out.println("图片生成成功!");
|
|
|
+ }
|
|
|
+
|
|
|
+ @Test
|
|
|
+ public void testDrawImage2() throws IOException, JSONException {
|
|
|
+ String data = FileUtils.readFileToString(new File("D:\\data\\ocr\\8888\\20240418000944690_0001.json.txt"), "utf-8");
|
|
|
+ ObjectMapper om = new ObjectMapper();
|
|
|
+ om.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
|
|
+ OcrModel.OcrTableResult tableResult = om.readValue(data, OcrModel.OcrTableResult.class);
|
|
|
+ if (null == tableResult.tableNum || tableResult.tableNum <= 0) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ BufferedImage oriImage = ImageIO.read(new File("D:\\data\\ocr\\8888\\20240418000944690_0001.jpg"));
|
|
|
+ BufferedImage image = new BufferedImage(oriImage.getWidth(), oriImage.getHeight(), BufferedImage.TYPE_INT_RGB);
|
|
|
+ Graphics2D g2d = image.createGraphics();
|
|
|
+ g2d.setColor(Color.WHITE);
|
|
|
+ g2d.fillRect(0, 0, image.getWidth(), image.getHeight());
|
|
|
+ g2d.setColor(Color.BLACK);
|
|
|
+ // g2d.drawOval(25, 25, 150, 150);
|
|
|
+ g2d.setFont(new Font("宋体", Font.BOLD, 16));
|
|
|
+ for(OcrModel.OcrTableCells tbl : tableResult.tables) {
|
|
|
+ for (OcrModel.OcrTableCell cell : tbl.getCells()) {
|
|
|
+ OcrModel.OcrPoint pt = cell.getLocations().get(0);
|
|
|
+ g2d.drawString(cell.getWords(), pt.getX(), pt.getY());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ g2d.dispose();
|
|
|
+ ImageIO.write(image, "png", new File("D:\\data\\ocr\\test2.png"));
|
|
|
+ System.out.println("图片生成成功!");
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ @Test
|
|
|
+ /**
|
|
|
+ * 检查所有OCR结果的行列识别情况,判断目录内的数据是否完整
|
|
|
+ */
|
|
|
+ public void testExtractReport() throws IOException {
|
|
|
+ File targetDir = new File("D:\\data\\ocr\\1111out");
|
|
|
+ Set<String> ignoreFiles = new HashSet<>();
|
|
|
+ List<String[]> lineList = new ArrayList<>();
|
|
|
+ lineList.add(new String[]{"文件", "表格序号", "行数", "列数"});
|
|
|
+ for (File file : FileUtils.listFiles(targetDir, new String[]{"json"}, false)) {
|
|
|
+ String data = FileUtils.readFileToString(file, "utf-8");
|
|
|
+ ObjectMapper om = new ObjectMapper();
|
|
|
+ om.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
|
|
+ OcrModel.OcrTableResult tableResult = om.readValue(data, OcrModel.OcrTableResult.class);
|
|
|
+ if (null == tableResult.tableNum || tableResult.tableNum <= 0) {
|
|
|
+ lineList.add(new String[]{file.getName(), "0", "0", "0"});
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ int i = 1;
|
|
|
+ for (OcrModel.OcrTableCells tbl : tableResult.tables) {
|
|
|
+ int rowStart = 0, rowEnd = 0, colStart = 0, colEnd = 0;
|
|
|
+ for (OcrModel.OcrTableCell oc : tbl.getCells()) {
|
|
|
+ rowStart = Math.min(oc.getRowStart(), rowStart);
|
|
|
+ rowEnd = Math.max(oc.getRowEnd(), rowEnd);
|
|
|
+ colStart = Math.min(oc.getColStart(), colStart);
|
|
|
+ colEnd = Math.max(oc.getColEnd(), colEnd);
|
|
|
+ }
|
|
|
+ lineList.add(new String[]{file.getName(), String.valueOf(i++), String.valueOf(rowEnd - rowStart), String.valueOf(colEnd - colStart)});
|
|
|
+ }
|
|
|
+ }
|
|
|
+ CSVWriter writer = new CSVWriter(new FileWriter("D:\\data\\ocr\\1111out.csv"));
|
|
|
+ for (String[] line : lineList) {
|
|
|
+ writer.writeNext(line);
|
|
|
+ }
|
|
|
+ writer.flush();
|
|
|
+ }
|
|
|
+
|
|
|
+ @Test
|
|
|
+ /**
|
|
|
+ * 按原始OCR结果导出Excel查看
|
|
|
+ */
|
|
|
+ public void testExportOriExcel() throws IOException, JSONException {
|
|
|
+ String jsonFileName = "D:\\data\\ocr\\8888\\" + "20240418004338611_0031p4";
|
|
|
+ OcrExcelExporter.exportOriData(FileUtils.readFileToString(new File(jsonFileName + ".json"), "utf-8"), jsonFileName + ".xlsx");
|
|
|
+ }
|
|
|
+
|
|
|
+ @Test
|
|
|
+ /**
|
|
|
+ * 模式一提取
|
|
|
+ */
|
|
|
+ public void testExtractExcel() throws IOException, JSONException {
|
|
|
+ Integer validCellCount = 5;
|
|
|
+
|
|
|
+ String data = FileUtils.readFileToString(new File("D:\\data\\ocr\\8888\\20240418004338611_0031p4.json"), "utf-8");
|
|
|
+ ObjectMapper om = new ObjectMapper();
|
|
|
+ om.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
|
|
+ OcrModel.OcrTableResult tableResult = om.readValue(data, OcrModel.OcrTableResult.class);
|
|
|
+ if (null == tableResult.tableNum || tableResult.tableNum <= 0) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ Collections.sort(tableResult.tables, new Comparator<OcrModel.OcrTableCells>() {
|
|
|
+ @Override
|
|
|
+ public int compare(OcrModel.OcrTableCells o1, OcrModel.OcrTableCells o2) {
|
|
|
+ int iRet;
|
|
|
+ if (0 != (iRet = o1.getLocations().get(0).getX().compareTo(o2.getLocations().get(0).getY()))) {
|
|
|
+ return iRet;
|
|
|
+ }
|
|
|
+ if (0 != (iRet = o1.getLocations().get(0).getY().compareTo(o2.getLocations().get(0).getY()))) {
|
|
|
+ return iRet;
|
|
|
+ }
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+ });
|
|
|
+ List<OcrModel.MajorRow> finalRowList = Lists.newArrayList();
|
|
|
+ OcrModel.MajorRow headRow = null;
|
|
|
+ OcrModel.MajorRow lastRow = new OcrModel.MajorRow();
|
|
|
+ for (OcrModel.OcrTableCells tbl : tableResult.tables) {
|
|
|
+ System.out.println("process table");
|
|
|
+ Collections.sort(tbl.getCells(), new Comparator<OcrModel.OcrTableCell>() {
|
|
|
+ @Override
|
|
|
+ public int compare(OcrModel.OcrTableCell o1, OcrModel.OcrTableCell o2) {
|
|
|
+ int iRet;
|
|
|
+ if (0 != (iRet = o1.getRowStart().compareTo(o2.getRowStart()))) {
|
|
|
+ return iRet;
|
|
|
+ }
|
|
|
+ if (0 != (iRet = o1.getColStart().compareTo(o2.getColStart()))) {
|
|
|
+ return iRet;
|
|
|
+ }
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+ });
|
|
|
+ Map<Integer, OcrModel.OcrTableRow> rowMap = new HashMap<>();
|
|
|
+ List<OcrModel.OcrTableRow> rowList = new ArrayList<>();
|
|
|
+ for (OcrModel.OcrTableCell cell : tbl.getCells()) {
|
|
|
+ if (cell.getRowEnd() - cell.getRowStart() > 1) {
|
|
|
+ System.out.println("多行错误");
|
|
|
+ }
|
|
|
+ OcrModel.OcrTableRow row = rowMap.get(cell.getRowStart());
|
|
|
+ if (null == row) {
|
|
|
+ if (0 != cell.getColStart()) {
|
|
|
+ System.out.println("起行错误");
|
|
|
+ }
|
|
|
+ row = new OcrModel.OcrTableRow();
|
|
|
+ rowList.add(row);
|
|
|
+ rowMap.put(cell.getRowStart(), row);
|
|
|
+ }
|
|
|
+ if (StringUtils.isNotBlank(cell.getWords())) {
|
|
|
+ row.setValidCount(row.getValidCount() + 1);
|
|
|
+ }
|
|
|
+ row.getCells().add(cell);
|
|
|
+ }
|
|
|
+ // 处理行粘连
|
|
|
+ List<OcrModel.OcrTableRow> newRowList = Lists.newArrayList();
|
|
|
+ for (OcrModel.OcrTableRow row : rowList) {
|
|
|
+ if (row.getCells().size() != validCellCount) {
|
|
|
+ // 处理列粘边
|
|
|
+ System.out.println("有列粘连");
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ OcrModel.RowSplitter.addRowForSplit(row, newRowList);
|
|
|
+ }
|
|
|
+ for (OcrModel.OcrTableRow row : newRowList) {
|
|
|
+ OcrModel.MajorRow finalRow = new OcrModel.MajorRow();
|
|
|
+ if (row.getValidCount() == 2) {
|
|
|
+ for (OcrModel.OcrTableCell tc : row.getCells()) {
|
|
|
+ if (StringUtils.isNotBlank(tc.getWords())) {
|
|
|
+ if (null == finalRow.getUniversityName()) {
|
|
|
+ finalRow.setUniversityName(tc.getWords().replaceAll("\n|\r", ""));
|
|
|
+ } else {
|
|
|
+ finalRow.setUniversityCount(tc.getWords().replaceAll("\n|\r", ""));
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ lastRow = finalRow;
|
|
|
+ continue;
|
|
|
+ } else if (row.getCells().size() != validCellCount) {
|
|
|
+ System.out.println("错误行: ");
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ String name = row.getCells().get(1).getWords();
|
|
|
+ String plan = row.getCells().get(3).getWords().replaceAll("\n|#", "");
|
|
|
+ if (!NumberUtils.isDigits(plan) && plan.contains("数")) {
|
|
|
+ if (null == headRow) {
|
|
|
+ headRow = new OcrModel.MajorRow();
|
|
|
+ headRow.setCollegeCode("院校代码");
|
|
|
+ headRow.setUniversityName("院校名称");
|
|
|
+ headRow.setUniversityCount("院校人数");
|
|
|
+ headRow.setGroup("专业组代码");
|
|
|
+ headRow.setGroupName("院校专业组");
|
|
|
+ headRow.setGroupCount("专业组人数");
|
|
|
+ headRow.setCourse(row.getCells().get(2).getWords().replaceAll("\n|\r", ""));
|
|
|
+ headRow.setCode("专业代号");
|
|
|
+ headRow.setName("专业名称及备注");
|
|
|
+ headRow.setPlanCount("专业人数");
|
|
|
+ headRow.setXuezhi("学制");
|
|
|
+ headRow.setXuefei(row.getCells().get(4).getWords().replaceAll("\n|\r", ""));
|
|
|
+ headRow.setRemark("备注");
|
|
|
+ } else {
|
|
|
+ System.out.println("标题重复错新一页");
|
|
|
+ }
|
|
|
+ continue;
|
|
|
+ } else if(RegExUtils.dotAllMatcher(".*第\\d+组", name).find()) {
|
|
|
+ lastRow.setGroup(row.getCells().get(0).getWords().replaceAll("\n|\r", ""));
|
|
|
+ lastRow.setGroupName(row.getCells().get(1).getWords().replaceAll("\n|\r", ""));
|
|
|
+ lastRow.setCourse(row.getCells().get(2).getWords().replaceAll("\n|\r", ""));
|
|
|
+ lastRow.setGroupCount(plan);
|
|
|
+ BeanUtils.copyProperties(lastRow, finalRow);
|
|
|
+ } else {
|
|
|
+ BeanUtils.copyProperties(lastRow, finalRow);
|
|
|
+ finalRow.setCode(row.getCells().get(0).getWords().replaceAll("\n|\r", ""));
|
|
|
+ finalRow.setName(row.getCells().get(1).getWords().replaceAll("\n|\r", ""));
|
|
|
+ finalRow.setPlanCount(plan);
|
|
|
+ finalRow.setXuefei(row.getCells().get(4).getWords());
|
|
|
+ }
|
|
|
+ finalRowList.add(finalRow);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (null != headRow) {
|
|
|
+ finalRowList.add(0, headRow);
|
|
|
+ }
|
|
|
+ System.out.println("end: " + finalRowList.size());
|
|
|
+ CSVWriter writer = new CSVWriter(new FileWriter("D:\\data\\ocr\\2024\\test1.csv"));
|
|
|
+ for (OcrModel.MajorRow row : finalRowList) {
|
|
|
+ writer.writeNext(new String[]{row.collegeCode, row.universityName, row.universityCount,
|
|
|
+ row.group, row.groupName, row.groupCount, row.course,
|
|
|
+ row.code, row.name, row.planCount, row.xuezhi, row.xuefei, row.remark});
|
|
|
+ }
|
|
|
+ writer.flush();
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ @Test
|
|
|
+ public void testExtractExcel2() throws IOException, JSONException {
|
|
|
+ /*String data = FileUtils.readFileToString(new File("D:\\data\\ocr\\2024\\autoGrab\\北京大学\\2022-理科-本一-北京大学(1003)((校址北京市)).json"), "utf-8");
|
|
|
+ ObjectMapper om = new ObjectMapper();
|
|
|
+ om.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
|
|
+ OcrTableResult tableResult = om.readValue(data, OcrTableResult.class);
|
|
|
+ if (null == tableResult.tableNum || tableResult.tableNum <= 0) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ Collections.sort(tableResult.tables, new Comparator<OcrTable>() {
|
|
|
+ @Override
|
|
|
+ public int compare(OcrTable o1, OcrTable o2) {
|
|
|
+ int iRet;
|
|
|
+ if (0 != (iRet = o1.getLocations().get(0).x.compareTo(o2.getLocations().get(0).x))) {
|
|
|
+ return iRet;
|
|
|
+ }
|
|
|
+ if (0 != (iRet = o1.getLocations().get(0).y.compareTo(o2.getLocations().get(0).y))) {
|
|
|
+ return iRet;
|
|
|
+ }
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+ });
|
|
|
+ List<FinalRow> finalRowList = Lists.newArrayList();
|
|
|
+ FinalRow headRow = null;
|
|
|
+ FinalRow lastRow = new FinalRow();
|
|
|
+ for (OcrTable tbl : tableResult.tables) {
|
|
|
+ System.out.println("process table");
|
|
|
+ Collections.sort(tbl.getCells(), new Comparator<OcrCell>() {
|
|
|
+ @Override
|
|
|
+ public int compare(OcrCell o1, OcrCell o2) {
|
|
|
+ int iRet;
|
|
|
+ if (0 != (iRet = o1.getRowStart().compareTo(o2.getRowStart()))) {
|
|
|
+ return iRet;
|
|
|
+ }
|
|
|
+ if (0 != (iRet = o1.getColStart().compareTo(o2.getColStart()))) {
|
|
|
+ return iRet;
|
|
|
+ }
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+ });
|
|
|
+ Map<Integer, OcrRow> rowMap = new HashMap<>();
|
|
|
+ List<OcrRow> rowList = new ArrayList<>();
|
|
|
+ for (OcrCell cell : tbl.getCells()) {
|
|
|
+ if (cell.rowEnd - cell.rowStart > 1) {
|
|
|
+ System.out.println("多行错误");
|
|
|
+ }
|
|
|
+ OcrRow row = rowMap.get(cell.rowStart);
|
|
|
+ if (null == row) {
|
|
|
+ if (0 != cell.colStart) {
|
|
|
+ System.out.println("起行错误");
|
|
|
+ }
|
|
|
+ row = new OcrRow();
|
|
|
+ rowList.add(row);
|
|
|
+ rowMap.put(cell.rowStart, row);
|
|
|
+ }
|
|
|
+ if (StringUtils.isNotBlank(cell.getWords())) {
|
|
|
+ row.validCount++;
|
|
|
+ }
|
|
|
+ row.getCells().add(cell);
|
|
|
+ }
|
|
|
+ for (OcrRow row : rowList) {
|
|
|
+ FinalRow finalRow = new FinalRow();
|
|
|
+ if (row.validCount == 1) {
|
|
|
+ for (OcrCell tc : row.cells) {
|
|
|
+ if (StringUtils.isNotBlank(tc.getWords())) {
|
|
|
+ finalRow.setUniversityName(tc.getWords().replaceAll("\n|\r", ""));
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ lastRow = finalRow;
|
|
|
+ continue;
|
|
|
+ } else if (row.cells.size() != 6) {
|
|
|
+ System.out.println("错误行: ");
|
|
|
+ } else if (row.cells.get(3).getWords().contains("计") || row.cells.get(3).getWords().contains("划")) {
|
|
|
+ if(null == headRow) {
|
|
|
+ headRow = new FinalRow();
|
|
|
+ headRow.setUniversityName("院校名称");
|
|
|
+ headRow.setUniversityCount("院校计划数");
|
|
|
+ headRow.setCode(row.cells.get(0).getWords().replaceAll("\n|\r", ""));
|
|
|
+ headRow.setName(row.cells.get(1).getWords().replaceAll("\n|\r", ""));
|
|
|
+ headRow.setRemark(row.cells.get(2).getWords().replaceAll("\n|\r", ""));
|
|
|
+ headRow.setPlanCount(row.cells.get(3).getWords().replaceAll("\n|\r", ""));
|
|
|
+ headRow.setXuezhi(row.cells.get(4).getWords().replaceAll("\n|\r", ""));
|
|
|
+ headRow.setXuefei(row.cells.get(5).getWords().replaceAll("\n|\r", ""));
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ finalRow.setUniversityName(lastRow.getUniversityName());
|
|
|
+ finalRow.setUniversityCount(lastRow.universityCount);
|
|
|
+ finalRow.setCode(row.cells.get(0).getWords());
|
|
|
+ finalRow.setName(row.cells.get(1).getWords().replaceAll("\n|\r", ""));
|
|
|
+ finalRow.setRemark(row.cells.get(2).getWords().replaceAll("\n|\r", ""));
|
|
|
+ finalRow.setPlanCount(row.cells.get(3).getWords());
|
|
|
+ finalRow.setXuezhi(row.cells.get(4).getWords());
|
|
|
+ finalRow.setXuefei(row.cells.get(5).getWords());
|
|
|
+ finalRowList.add(finalRow);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (null != headRow) {
|
|
|
+ finalRowList.add(0, headRow);
|
|
|
+ }
|
|
|
+ System.out.println("end: " + finalRowList.size());
|
|
|
+ CSVWriter writer = new CSVWriter(new FileWriter("D:\\data\\ocr\\testYZY.csv"));
|
|
|
+ for (FinalRow row : finalRowList) {
|
|
|
+ writer.writeNext(new String[]{row.universityName, row.universityCount, row.code, row.name, row.remark, row.planCount, row.xuezhi, row.xuefei});
|
|
|
+ }
|
|
|
+ writer.flush();*/
|
|
|
+ }
|
|
|
+
|
|
|
+}
|