问题及现象
在OpenXML中文件不包含空白单元格的条目,这就是跳过空白单元格的原因。
所以如果当我们打开一个excel,读取一个表格数据,发现如果有空单元格,openXML会跳过导致读取的数据发生错位。
比如这个是原始的excel表格数据。
年级 | 班级 | 身份标识 | 姓名 | 性别 | 年龄 |
一年级 | 2024010102 | 女 | 15 | ||
一年级 | 一班 | 2024010103 | 王五 | 男 | 14 |
一年级 | 一班 | 2024010104 | 赵六 | 男 | 14是 |
一年级 | 一班 | 2024010105 | 钱七 | 女 | 14 |
一年级 | 2024010106 | 孙八 | 男 | 12 |
当有空格读取后,第一行和最后一行的数据就会错位了,如下:
年级 | 班级 | 身份标识 | 姓名 | 性别 | 年龄 |
一年级 | 2024010102 | 女 | 15 | ||
一年级 | 一班 | 2024010103 | 王五 | 男 | 14 |
一年级 | 一班 | 2024010104 | 赵六 | 男 | 14是 |
一年级 | 一班 | 2024010105 | 钱七 | 女 | 14 |
一年级 | 2024010106 | 孙八 | 男 | 12 |
解决的办法就是:
假设:
DocumentFormat.OpenXml.Spreadsheet.Row row,
Cell cell = row.Elements<Cell>().FirstOrDefault(c => c.CellReference == $"列行");
//例如:c.CellReference =="A12"
当cell是空的时候,表示该单元格是空值。
因此对于从行中提取单元格不能使用“foreach (Cell cell in row)”,这样取出来的cell是非空的单元格,也就是如果你有10列,有两列是空单元格,那么就只能取出来8列,这就导致了取出数据的错位。
而应该使用循环,也就是知道了表格的列数了,然后使用for去循环,例如:
for (int columnIndex = 0; columnIndex < dt.Columns.Count; columnIndex++){//查找指定的行列单元格是否存在。Cell cell = row.Elements<Cell>().FirstOrDefault(c => c.CellReference == $"{columnReference[columnIndex]}{row.RowIndex}");//例如:c.CellReference =="A12"string cellVal = null; //定义获取的单元格的值,默认为空if (cell != null){//不为空使用定义的GetCellValue()函数获取cell中的值cellVal = GetCellValue(cell, workbookPart);}....}
以下为封装的OpenXML处理的完整代码
调用ReadSheetWithHeader()函数,ReadSheetWithHeader会调用封装的OpenXml类OutExcel对象,从而把指定的excel文件的sheet表读取到DataTable的数据集合中。
/// <summary>/// 将指定的excel文件中的指定索引的sheet读取到表对象中/// </summary>/// <param name="fileNm">excel文件路径</param>/// <param name="sheetIndex">sheet索引</param>/// <returns>返回DataTable对象</returns>public DataTable ReadSheetWithHeader(string fileNm, int sheetIndex){FileStream fs = new FileStream(fileNm, FileMode.Open, FileAccess.Read, FileShare.Read);DataTable dt = new OutExcel().ReadExcel(sheetIndex, fs);return dt;}
封装OpenXml类OutExcel
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Spreadsheet;
using System;
using System.Collections.Generic;
using System.Data;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;namespace OpenExcelMng
{public class OutExcel{/// <summary>/// 按照给定的Excel流组织成Datatable/// </summary>/// <param name="sheetName">须要读取的Sheet的名称</param>/// <param name="stream">Excel文件流</param>/// <returns>组织好的DataTable</returns>public DataTable ReadExcel(string sheetName, Stream stream){using (SpreadsheetDocument document = SpreadsheetDocument.Open(stream, false)){ //打开StreamWorkbookPart workbookPart = document.WorkbookPart;IEnumerable<Sheet> sheets = workbookPart.Workbook.Descendants<Sheet>().Where(s => s.Name == sheetName);if (sheets.Count() == 0){//找出合适前提的sheet,没有则返回 return null;}WorksheetPart worksheetPart = (WorksheetPart)document.WorkbookPart.GetPartById(sheets.First().Id);//获取Excel中共享数据SharedStringTable stringTable = document.WorkbookPart.SharedStringTablePart.SharedStringTable;IEnumerable<Row> rows = worksheetPart.Worksheet.Descendants<Row>();//获得Excel中得数据行DataTable dt = new DataTable("Excel");//因为须要将数据导入到DataTable中,所以我们假定Excel的第一行是列名,从第二行开端是行数据foreach (Row row in rows){if (row.RowIndex == 1){//Excel第一行动列名GetDataColumn(row, stringTable, ref dt);}else{GetDataRow(row, stringTable, workbookPart, ref dt);//Excel第二行同时为DataTable的第一行数据}}return dt;}}/// <summary>/// 按照给定的Excel流组织成Datatable/// </summary>/// <param name="sheetIndex">须要读取的Sheet的索引</param>/// <param name="sheetIndex">Excel文件流</param>/// <returns>组织好的DataTable</returns>public DataTable ReadExcel(int sheetIndex, Stream stream){using (SpreadsheetDocument document = SpreadsheetDocument.Open(stream, false)){//打开StreamWorkbookPart workbookPart = document.WorkbookPart;IList<Sheet> sheets = workbookPart.Workbook.Descendants<Sheet>().ToList();if (sheets.Count() == 0){//找出合适前提的sheet,没有则返回 return null;}WorksheetPart worksheetPart = (WorksheetPart)document.WorkbookPart.GetPartById(sheets[sheetIndex].Id);//获取Excel中共享数据SharedStringTable stringTable = document.WorkbookPart.SharedStringTablePart.SharedStringTable;IEnumerable<Row> rows = worksheetPart.Worksheet.Descendants<Row>();//获得Excel中得数据行DataTable dt = new DataTable("Excel");//因为须要将数据导入到DataTable中,所以我们假定Excel的第一行是列名,从第二行开端是行数据foreach (Row row in rows){if (row.RowIndex == 1){//Excel第一行动列名GetDataColumn(row, stringTable, ref dt);}else{GetDataRow(row, stringTable, workbookPart, ref dt);//Excel第二行同时为DataTable的第一行数据}}return dt;}}/// <summary>/// 构建DataTable的列/// </summary>/// <param name="row">OpenXML定义的Row对象</param>/// <param name="stringTablePart"></param>/// <param name="dt">须要返回的DataTable对象</param>/// <returns></returns>public void GetDataColumn(Row row, SharedStringTable stringTable, ref DataTable dt){DataColumn col = new DataColumn();Dictionary<string, int> columnCount = new Dictionary<string, int>();foreach (Cell cell in row){string cellVal = GetValue(cell, stringTable);col = new DataColumn(cellVal);if (IsContainsColumn(dt, col.ColumnName)){if (!columnCount.ContainsKey(col.ColumnName))columnCount.Add(col.ColumnName, 0);col.ColumnName = col.ColumnName + (columnCount[col.ColumnName]++);}dt.Columns.Add(col);}}/// <summary>/// 构建DataTable的每一行数据,并返回该Datatable/// </summary>/// <param name="row"></param>/// <param name="stringTable"></param>/// <param name="workbookPart">用于处理获取Cell中的信息,如果Cell存在,不是空单元格</param>/// <param name="dt">把行数据写入到datatabl中</param>private void GetDataRow(DocumentFormat.OpenXml.Spreadsheet.Row row,DocumentFormat.OpenXml.Spreadsheet.SharedStringTable stringTable, //不再使用DocumentFormat.OpenXml.Packaging.WorkbookPart workbookPart, //用于处理获取Cell中的信息,如果Cell存在,不是空单元格ref System.Data.DataTable dt) //把行数据写入到datatabl中。{// 读取算法:按行一一读取单位格,若是整行均是空数据// 则忽视改行(因为本人的工作内容不须要空行)-_-DataRow dr = dt.NewRow();int i = 0;int nullRowCount = i;Dictionary<int, string> columnReference = new Dictionary<int, string>();columnReference.Add(0, "A");columnReference.Add(1, "B");columnReference.Add(2, "C");columnReference.Add(3, "D");columnReference.Add(4, "E");columnReference.Add(5, "F");columnReference.Add(6, "G");columnReference.Add(7, "H");for (int columnIndex = 0; columnIndex < dt.Columns.Count; columnIndex++){Cell cell = row.Elements<Cell>().FirstOrDefault(c => c.CellReference == $"{columnReference[columnIndex]}{row.RowIndex}");//例如:c.CellReference =="A12"string cellVal = null;if (cell != null){cellVal = GetCellValue(cell, workbookPart);}if (string.IsNullOrEmpty(cellVal)){nullRowCount++;}dr[i] = cellVal;i++;} if (nullRowCount != i){dt.Rows.Add(dr);}}/// <summary>/// 获取单位格的值/// </summary>/// <param name="cell">单元格</param>/// <param name="workbookPart"></param>/// <param name="type">1 不去空格 2 前后空格 3 所有空格 </param>/// <returns></returns>public static string GetCellValue(Cell cell, WorkbookPart workbookPart, int type = 2){//合并单元格不做处理if (cell.CellValue == null)return string.Empty;string cellInnerText = cell.CellValue.InnerXml;//纯字符串if (cell.DataType != null && (cell.DataType.Value == CellValues.SharedString || cell.DataType.Value == CellValues.String || cell.DataType.Value == CellValues.Number)){//获取spreadsheetDocument中共享的数据SharedStringTable stringTable = workbookPart.SharedStringTablePart.SharedStringTable;//如果共享字符串表丢失,则说明出了问题。if (!stringTable.Any())return string.Empty;string text = stringTable.ElementAt(int.Parse(cellInnerText)).InnerText;if (type == 2)return text.Trim();else if (type == 3)return text.Replace(" ", "");elsereturn text;}//bool类型else if (cell.DataType != null && cell.DataType.Value == CellValues.Boolean){return (cellInnerText != "0").ToString().ToUpper();}//数字格式代码(numFmtId)小于164是内置的:https://www.it1352.com/736329.htmlelse{//为空为数值if (cell.StyleIndex == null)return cellInnerText;Stylesheet styleSheet = workbookPart.WorkbookStylesPart.Stylesheet;CellFormat cellFormat = (CellFormat)styleSheet.CellFormats.ChildElements[(int)cell.StyleIndex.Value];uint formatId = cellFormat.NumberFormatId.Value;double doubleTime;//OLE 自动化日期值DateTime dateTime;//yyyy/MM/dd HH:mm:ssswitch (formatId){case 0://常规return cellInnerText;case 9://百分比【0%】case 10://百分比【0.00%】case 11://科学计数【1.00E+02】case 12://分数【1/2】return cellInnerText;case 14:doubleTime = double.Parse(cellInnerText);dateTime = DateTime.FromOADate(doubleTime);return dateTime.ToString("yyyy/MM/dd");//case 15://case 16:case 17:doubleTime = double.Parse(cellInnerText);dateTime = DateTime.FromOADate(doubleTime);return dateTime.ToString("yyyy/MM");//case 18://case 19:case 20:doubleTime = double.Parse(cellInnerText);dateTime = DateTime.FromOADate(doubleTime);return dateTime.ToString("H:mm");case 21:doubleTime = double.Parse(cellInnerText);dateTime = DateTime.FromOADate(doubleTime);return dateTime.ToString("HH:mm:ss");case 22:doubleTime = double.Parse(cellInnerText);dateTime = DateTime.FromOADate(doubleTime);return dateTime.ToString("yyyy/MM/dd HH:mm");//case 45://case 46:case 47:doubleTime = double.Parse(cellInnerText);dateTime = DateTime.FromOADate(doubleTime);return dateTime.ToString("yyyy/MM/dd");case 58://【中国】11月11日doubleTime = double.Parse(cellInnerText);dateTime = DateTime.FromOADate(doubleTime);return dateTime.ToString("MM/dd");case 176://【中国】2020年11月11日doubleTime = double.Parse(cellInnerText);dateTime = DateTime.FromOADate(doubleTime);return dateTime.ToString("yyyy/MM/dd");case 177://【中国】11:22:00doubleTime = double.Parse(cellInnerText);dateTime = DateTime.FromOADate(doubleTime);return dateTime.ToString("HH:mm:ss");default:return cellInnerText;}}}/// <summary>/// 获取单位格的值/// </summary>/// <param name="cell"></param>/// <param name="stringTablePart"></param>/// <returns></returns>private string GetValue(Cell cell, SharedStringTable stringTable){//因为Excel的数据存储在SharedStringTable中,须要获取数据在SharedStringTable 中的索引string value = string.Empty;try{if (cell.ChildElements.Count == 0)return value;value = double.Parse(cell.CellValue.InnerText).ToString();if (cell.DataType != null){switch (cell.DataType.Value){case CellValues.SharedString:value = stringTable.ChildElements[Int32.Parse(value)].InnerText; break;}}}catch (Exception ex){value = "N/A";}return value;}/// <summary>/// 判断网格是否存在列/// </summary>/// <param name="dt">网格</param>/// <param name="columnName">列名</param>/// <returns></returns>public bool IsContainsColumn(DataTable dt, string columnName){if (dt == null || columnName == null){return false;}return dt.Columns.Contains(columnName);}public static void ConvertToDateTime(ref DataTable dt, string columnNm, string dtFormat){int findLoca_Old = dt.Columns.IndexOf(columnNm);DataColumn newColumn = new DataColumn(System.Guid.NewGuid().ToString(), typeof(String));string newColumnNm = newColumn.ColumnName;dt.Columns.Add(newColumn);newColumn.SetOrdinal(findLoca_Old + 1);foreach (DataRow row in dt.Rows){try{double val = Convert.ToDouble(row[columnNm]);row[newColumnNm] = DateTime.FromOADate(val).ToString(dtFormat);}catch (Exception ex){;}}dt.Columns.RemoveAt(findLoca_Old);newColumn.ColumnName = columnNm;}}
}