Small Note-
Office Open XML (also informally known as OOXML or OpenXML) is a zipped,
XML-based file format developed by Microsoft[2] for representing spreadsheets,
charts, presentations and word processing documents.
Reason of using-
I have used in one my project, to read the xlsx files. Before it was done using the OLEDB.
The reason of changing code and making use of Open Xml is to get the full control of Xlsx file properties.
Say, we require in detail information about the sheets like, Sheet index , Sheet name , State whether the sheet is
hidden or visible or very hidden. From these properties we can query for required sheet data and filtering out the
other sheets.
A diagram of OpenXML spreadsheet, how objects looks like-
The structure of OpenXML spreadsheet is something like this from what I can tell:
Spreadsheet
|
WorkbookPart
/ | \
Workbook WorkbookStylesPart WorksheetPart
| | |
Sheets StyleSheet Worksheet
| / \
(refers to SheetData Columns
Worksheetparts) |
Rows
I am going to paste the snippet of the code of all method is required. This could help you in your project.
I have created 3 class
Excel (Object of this will have access of excel file property, gets the sheets information)
Sheet (Object of this will have access of Sheet property, gets the Cells information)
Cell (Object of this will have access of Cell property)
Excel -> Sheet - > Cell
I am going to point out main methods and its importance.
1. How to Load the cell Formats?
I have took one dictionary object which well be filled with the formats -
Cell Format has the formatId and formatstring.
Key - formatId
Value - formatString
Loading the cell formats into the Dictionary
formatMappings = new Dictionary<uint, string>();
var stylePart = _spreadSheetDocument.WorkbookPart.WorkbookStylesPart;
var numFormatsParentNodes = stylePart.Stylesheet.ChildElements.OfType<NumberingFormats>();
foreach (var numFormatParentNode in numFormatsParentNodes)
{
var formatNodes = numFormatParentNode.ChildElements.OfType<NumberingFormat>();
foreach (var formatNode in formatNodes)
{
_formatMappings.Add(formatNode.NumberFormatId.Value, formatNode.FormatCode);
}
}
//formatMappings.Add(1, "0");
//formatMappings.Add(2, "0.00");
//formatMappings.Add(3, "#,##0");
//formatMappings.Add(4, "#,##0.00");
//formatMappings.Add(9, "0%");
//formatMappings.Add(10, "0.00%");
//formatMappings.Add(11, "0.00E+00");
//formatMappings.Add(12, "# ?/?");
//formatMappings.Add(13, "# ??/??");
//formatMappings.Add(14, "mm-dd-yy");
//formatMappings.Add(15, "d-mmm-yy");
//formatMappings.Add(16, "d-mmm");
//formatMappings.Add(17, "mmm-yy");
//formatMappings.Add(18, "h:mm AM/PM");
//formatMappings.Add(19, "h:mm:ss AM/PM");
//formatMappings.Add(20, "h:mm");
//formatMappings.Add(21, "h:mm:ss");
//formatMappings.Add(22, "m/d/yy h:mm");
//formatMappings.Add(37, "#,##0 ;(#,##0)");
//formatMappings.Add(38, "#,##0 ;(#,##0)");
//formatMappings.Add(39, "#,##0.00;(#,##0.00)");
//formatMappings.Add(40, "#,##0.00;(#,##0.00)");
//formatMappings.Add(45, "mm:ss");
//formatMappings.Add(46, "[h]:mm:ss");
//formatMappings.Add(47, "mmss.0");
//formatMappings.Add(48, "##0.0E+0");
//formatMappings.Add(49, "@");
This COM is used to get cell value to be format.
[DllImport("oleaut32.dll")]
public static extern int VarFormat(
ref object o,
[MarshalAs(UnmanagedType.BStr)]
string format,
int firstDay,
int firstWeek,
uint flags,
[MarshalAs(UnmanagedType.BStr)]
ref string output);
2. How to Load the sharedString?
Location is sharedStrings.xml.
SharedString are the cell value whose datatype are 's' string.
Loading into the list<string>
var sheets = _spreadSheetDocument.WorkbookPart.Workbook.Descendants<Sheet>();
List<string> sharedStringTablePart = new List<string>();
if (SpreadSheetDoc.WorkbookPart.SharedStringTablePart != null)
{
sharedStringTablePart = SpreadSheetDoc.WorkbookPart.
SharedStringTablePart.SharedStringTable.
ChildElements.Select(ce => ce.InnerText != null ? ce.InnerText : "").ToList();
}
Usage
var sheets = _spreadSheetDocument.WorkbookPart.Workbook.Descendants<Sheet>();
List<string> sharedStringTablePart = new List<string>();
if (SpreadSheetDoc.WorkbookPart.SharedStringTablePart != null)
{
sharedStringTablePart = SpreadSheetDoc.WorkbookPart.
SharedStringTablePart.SharedStringTable.
ChildElements.Select(ce => ce.InnerText != null ? ce.InnerText : "").ToList();
}
Class file.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Data;
using System.IO;
using System.IO.Packaging;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Spreadsheet;
using System.Text.RegularExpressions;
using System.Runtime.InteropServices;
using System.Globalization;
using System.Diagnostics;
public class OXExcel
{
private string _fileName;
internal static SpreadsheetDocument _spreadSheetDocument;
internal static Dictionary<uint, String> _formatMappings = new Dictionary<uint, string>();//Dictionary of number formats , loaded from the style.xml.
internal static Regex _rx_dateFmtChk = new Regex("[A-Za-z]+", RegexOptions.Compiled);
public OXExcel(string fileName)
{
_fileName = fileName;
}
public void Initialize()
{
OpenFile();
BuildFormatMappingsFromXlsx();
}
public void Finalizer()
{
_formatMappings.Clear();
CloseFile();
}
/// <summary>
/// Property returning the Format Dictionary.
/// </summary>
internal Dictionary<uint, String> FormatMappings
{
get
{
return _formatMappings;
}
}
/// <summary>
/// Fills the Dictionary with the format(number/date/currency). It is found in style.xml.
/// </summary>
/// <returns></returns>
private Dictionary<uint, String> BuildFormatMappingsFromXlsx()
{
try
{
_formatMappings = new Dictionary<uint, string>();
var stylePart = _spreadSheetDocument.WorkbookPart.WorkbookStylesPart;
var numFormatsParentNodes = stylePart.Stylesheet.ChildElements.OfType<NumberingFormats>();
foreach (var numFormatParentNode in numFormatsParentNodes)
{
var formatNodes = numFormatParentNode.ChildElements.OfType<NumberingFormat>();
foreach (var formatNode in formatNodes)
{
_formatMappings.Add(formatNode.NumberFormatId.Value, formatNode.FormatCode);
}
}
//formatMappings.Add(1, "0");
//formatMappings.Add(2, "0.00");
//formatMappings.Add(3, "#,##0");
//formatMappings.Add(4, "#,##0.00");
//formatMappings.Add(9, "0%");
//formatMappings.Add(10, "0.00%");
//formatMappings.Add(11, "0.00E+00");
//formatMappings.Add(12, "# ?/?");
//formatMappings.Add(13, "# ??/??");
//formatMappings.Add(14, "mm-dd-yy");
//formatMappings.Add(15, "d-mmm-yy");
//formatMappings.Add(16, "d-mmm");
//formatMappings.Add(17, "mmm-yy");
//formatMappings.Add(18, "h:mm AM/PM");
//formatMappings.Add(19, "h:mm:ss AM/PM");
//formatMappings.Add(20, "h:mm");
//formatMappings.Add(21, "h:mm:ss");
//formatMappings.Add(22, "m/d/yy h:mm");
//formatMappings.Add(37, "#,##0 ;(#,##0)");
//formatMappings.Add(38, "#,##0 ;(#,##0)");
//formatMappings.Add(39, "#,##0.00;(#,##0.00)");
//formatMappings.Add(40, "#,##0.00;(#,##0.00)");
//formatMappings.Add(45, "mm:ss");
//formatMappings.Add(46, "[h]:mm:ss");
//formatMappings.Add(47, "mmss.0");
//formatMappings.Add(48, "##0.0E+0");
//formatMappings.Add(49, "@");
}
finally
{
}
return _formatMappings;
}
/// <summary>
/// Property returing the spreadsheetDocument object.
/// </summary>
internal SpreadsheetDocument SpreadSheetDoc
{
get
{
return _spreadSheetDocument;
}
}
/// <summary>
/// Opens the Excel file.
/// </summary>
public void OpenFile()
{
_spreadSheetDocument = SpreadsheetDocument.Open(_fileName, false);
}
/// <summary>
/// Closes the Excel file.
/// </summary>
public void CloseFile()
{
try
{
_spreadSheetDocument.Close();
}
catch (Exception ex)
{
if (ex.Message.Contains("Cannot access a disposed object"))
{
//do nothing.
}
else
{
throw ex;
}
}
}
/// <summary>
/// Returns the list of sheets present in the Excel file.
/// List holds the object of OXSheet class.
/// </summary>
/// <returns></returns>
public List<OXSheet> GetSheetInfo()
{
List<OXSheet> lstOxSheet = new List<OXSheet>();
try
{
var sheets = _spreadSheetDocument.WorkbookPart.Workbook.Descendants<Sheet>();
List<string> sharedStringTablePart = new List<string>();
if (SpreadSheetDoc.WorkbookPart.SharedStringTablePart != null)
{
sharedStringTablePart = SpreadSheetDoc.WorkbookPart.
SharedStringTablePart.SharedStringTable.
ChildElements.Select(ce => ce.InnerText != null ? ce.InnerText : "").ToList();
}
_log.Debug("sharedStringTablePart count:"+sharedStringTablePart.Count);
sheets.ToList().ForEach(
sh => lstOxSheet.Add(
new OXSheet(
(WorksheetPart)SpreadSheetDoc.WorkbookPart.GetPartById(sh.Id.Value),/*PARAM 1*/
SpreadSheetDoc.WorkbookPart.WorkbookStylesPart.Stylesheet, sharedStringTablePart,/*PARAM 2*/
sh /*PARAM 3*/)));
}
finally
{
}
return lstOxSheet;
}
/// <summary>
/// Returns the list of Albhabets same as Excel columns.
/// Item values are used for creating the datacolumn name.
/// </summary>
/// <param name="num"></param>
/// <returns></returns>
public List<string> ExcelColumns(int num)
{
_log.Debug("[BEGIN][ExcelColumns]");
List<string> col = new List<string>();
string temp = "";
int set = 0;
int j = 1;
List<string> alp = new List<string>() { "A","B","C","D","E","F","G","H","I","J","K","L",
"M","N","O","P","Q","R","S","T","U","V","W","X",
"Y","Z"};
for (int i = 0; i < num; i++)
{
col.Add(temp + alp[j - 1]);
if (j % alp.Count == 0)
{
temp = col[set];
set++;
j = 0;
}
j++;
}
_log.Debug("[RETURNS][ExcelColumns], COLUMN COUNT:"+col.Count);
return col;
}
}
/// <summary>
/// Excel Sheet::
/// Object of this will have information of the sheet visibility,cell values,number of rows/columns.
/// </summary>
///
public class OXSheet
{
private readonly int _colCount;
private readonly int _rowCount;
private readonly string _range;
private Sheet _sheet;
private WorksheetPart _worksheetPart;
private List<UInt32> _numberFormatId;
private bool _isVisible = false;
private bool _hasValues = false;
List<string> _sharedStringTablePart;
public OXSheet(WorksheetPart worksheetPart, Stylesheet stylesheet,
List<string> sharedStringTablePart,
Sheet sheet)
{
_sheet = sheet;
_sharedStringTablePart = sharedStringTablePart;
_worksheetPart = worksheetPart;
//memory consumes more
//reades sheet<number>.xml file ,loads in the memory.
_range = GetRange(SheetUri);
//_range = _worksheetPart.Worksheet.SheetDimension.Reference.Value;
//-------------------
if (sheet.State == null)
{
_isVisible = true;
_hasValues = true;
}
else
{
_hasValues = sheet.State.HasValue;
}
string range = _range;
range = range.Substring(range.IndexOf(":") + 1);
_rowCount = Convert.ToInt32(Regex.Match(range, @"\d+").Value);
_colCount = ColumnNameToNumber(Regex.Match(range, @"\D+").Value);
OpenXmlElementList oxe = stylesheet.CellFormats.ChildElements;
_numberFormatId = new List<UInt32>();
for (int i = 0; i < oxe.Count; i++)
{
_numberFormatId.Add(new CellFormat(oxe[i].OuterXml).NumberFormatId);
}
}
private string GetRange(string sheetUri)
{
Package wdPackage = OXExcel._spreadSheetDocument.Package;
List<PackagePart> pp = wdPackage.GetParts().Where(p => p.Uri.ToString().EndsWith("/" + sheetUri)).ToList();
string range = "";
if (pp.Count == 1)
{
Uri documentUri = PackUriHelper.ResolvePartUri(new Uri("/", UriKind.Relative), pp[0].Uri);
PackagePart documentPart = wdPackage.GetPart(documentUri);
//Load the document XML in the part into an XDocument instance.
byte[] buffer = new byte[1000];
Regex rgx = new Regex(@"(?<=<(x:)?dimension( )+ref=).+?>");
int bytesRead = 0;
Stream stream = documentPart.GetStream();
while ((bytesRead = stream.Read(buffer, 0, buffer.Length)) > 0)
{
string s = Encoding.UTF8.GetString(buffer, 0, bytesRead);
range = rgx.Match(s).Value;
if (range.Length > 0) break;
}
}
if (string.IsNullOrEmpty(range))
{
_log.Error("Could not found SheetDimension reference range");
throw new Exception("Could not found SheetDimension reference range");
}
_log.Debug("[RETURNS][GetRange -CONSTRUCTOR] range:" + range);
return range.Replace(">", "").Replace("/", "").Replace("\"", "").Trim();
}
/// <summary>
/// Sheet Id of the Sheet.
/// </summary>
public Int32 SheetId
{
get
{
return Convert.ToInt32(this.Sheet.SheetId.ToString()) - 1;
}
}
public string SheetUri
{
get
{
return this.Sheet.Id.Value.Replace("rId", "sheet") + ".xml";
}
}
/// <summary>
/// Name of the Sheet.
/// </summary>
public string Name
{
get
{
return this.Sheet.Name;
}
}
/// <summary>
/// Returns the list of OXcell object.
/// OXcell object has the cell information like value,format Id,row/column positions.
/// </summary>
/// <returns></returns>
public List<OXCell> GetCells()
{
List<OXCell> oxCell = new List<OXCell>();
try
{
var qry = _worksheetPart.Worksheet.Descendants<Cell>().Where(cl => cl.CellValue != null);
qry.ToList().ForEach(cl => oxCell.Add(
new OXCell(cl/*PARAM 1*/,
_sharedStringTablePart/*PARAM 2*/,
_numberFormatId/*PARAM 3*/)));
}
finally
{
}
return oxCell;
}
/// <summary>
/// Returns the Open XML Doc Sheet object.
/// </summary>
internal Sheet Sheet
{
get
{
return _sheet;
}
}
/// <summary>
/// Returns the number of rows present.
/// </summary>
public int RowCount
{
get
{
return _rowCount;
}
}
/// <summary>
/// Returns the number of Columns present.
/// </summary>
public int ColCount
{
get
{
return _colCount;
}
}
/// <summary>
/// From the Range it gets the Column Name.
/// </summary>
/// <param name="cellName"></param>
/// <returns></returns>
private string ColumnName(string cellName)
{
// Create a regular expression to match the column name portion of the cell name.
Regex regex = new Regex("[A-Za-z]+");
Match match = regex.Match(cellName);
return match.Value;
}
/// <summary>
/// Returns number of Alphabetic count for given column name(alphabetic)
/// </summary>
/// <param name="col_name"></param>
/// <returns></returns>
private int ColumnNameToNumber(string col_name)
{
int result = 0;
// Process each letter.
for (int i = 0; i < col_name.Length; i++)
{
result *= 26;
char letter = col_name[i];
// See if it's out of bounds.
if (letter < 'A') letter = 'A';
if (letter > 'Z') letter = 'Z';
// Add in the value of this letter.
result += (int)letter - (int)'A' + 1;
}
return result;
}
/// <summary>
/// Property to returns the Sheet Visibility.
/// True: Sheet is Visible.
/// False: Sheet is Hidden/Very Hidden.
/// </summary>
public bool Visiblity
{
get
{
return _isVisible;
}
}
/// <summary>
/// If the sheet has the values it returns true else false.
/// </summary>
public bool HasValue
{
get
{
return _hasValues;
}
}
}
public class OXCell
{
[DllImport("oleaut32.dll")]
public static extern int VarFormat(
ref object o,
[MarshalAs(UnmanagedType.BStr)]
string format,
int firstDay,
int firstWeek,
uint flags,
[MarshalAs(UnmanagedType.BStr)]
ref string output);
//private Cell _cell;
private readonly string _value;
private readonly uint _formatCode;
private readonly string _formatValue;
private readonly string _range;
private readonly string _colName;
private readonly int _rowIdx;
private readonly string _org_value;
private EnumValue<CellValues> _dataType;
/// <summary>
/// Constructor.
/// </summary>
/// <param name="sheet"></param>
/// <param name="cell"></param>
public OXCell(Cell cell, List<string> sharedStringTablePart, List<UInt32> numberFormatId)
{
//_cell = cell;
_range = cell.CellReference;
string rng = _range;
_rowIdx = Convert.ToInt32(Regex.Match(rng, @"\d+").Value);
_colName = Regex.Match(rng, @"\D+").Value;
_dataType = cell.DataType;
UInt32 n = 0;
CellType ct = cell;
object value = null;
string v = cell.CellValue.InnerText;
if (cell.DataType != null && cell.DataType.Value == CellValues.SharedString)
{
v = sharedStringTablePart[Int32.Parse(v)];
}
else
{
if (ct.StyleIndex != null)
{
n = numberFormatId[Convert.ToInt32(ct.StyleIndex.ToString())];
_formatCode = n;
}
}
_org_value = v;
if (cell.DataType == "s")
{
value = v;
}
else if ((n >= 14 && n <= 22) || (n >= 45 && n <= 47)
|| (n >= 164 && n <= 180) || n == 278 || n == 185 || n == 196 ||
n == 217 || n == 326)
/*Generally / 20% assumed and taken as date format code. But there are few which are not date instead it would be formated currency.*/
{
double res;
if (double.TryParse(v, NumberStyles.Any, CultureInfo.InvariantCulture, out res))
{
try
{
string frmt = OXExcel._formatMappings.ContainsKey(n) ? OXExcel._formatMappings[n] : null;
_formatValue = frmt;
string temp = frmt;
/* Format definition in the temp variable is cleaned against the contents with in
* Square braces'[' (which seems to be the colouring information / Format code).
* This cleaning is required to check whether the format is for date / number /others.
* If it is found as date, then only format is applied else no. */
while (!string.IsNullOrEmpty(temp) && temp.IndexOf('[') > -1 && temp.IndexOf(']') > -1)
{
temp = temp.Remove(temp.LastIndexOf('['), (temp.LastIndexOf(']') - temp.LastIndexOf('[')) + 1);
}
/*------------------------------------------------------------------------------------------------*/
if (n == 14)
{
/*Default date in the excel will be of short date. So setting this explicitly to short date.*/
value = DateTime.FromOADate(res).ToShortDateString();
}
else
{
value = DateTime.FromOADate(res);
}
/*Format apply block*/
if (cell.DataType == null &&
(!string.IsNullOrEmpty(temp) && OXExcel._rx_dateFmtChk.IsMatch(temp))/*Positive check for the date format.*/)
{
string output = null;
if (!string.IsNullOrEmpty(frmt))
{
int ret = VarFormat(ref value, frmt, 0, 0, 0, ref output);
if (ret >= 0)
{
value = output;
}
}
}
else
{
if (!string.IsNullOrEmpty(temp))
/* This check is requied when the value is date,
* do not remove. If the value is of date and its format is not present
* in the dictionary then this check will prevents from preserving date value. */
{
value = res;
}
}
}
catch
{
value = v;
}
}
else
{
value = "";
}
}
else
{
double d;
if (double.TryParse(v, NumberStyles.Any, CultureInfo.InvariantCulture, out d))
{
value = d;
}
else
{
value = "";
}
}
_value = value.ToString();
}
/// <summary>
/// Cell's column position as string.
/// </summary>
public string Column
{
get
{
return _colName;
}
}
/// <summary>
/// Cell's row position as int.
/// </summary>
public int Row
{
get
{
return _rowIdx;
}
}
/// <summary>
/// Cell's range.
/// </summary>
public string Range
{
get
{
return _range;
}
}
/// <summary>
/// Cell's Format value.
/// </summary>
public string FormatValue
{
get
{
return _formatValue;
}
}
/// <summary>
/// Cell value which is formated.
/// </summary>
public string Value
{
get
{
return _value;
}
}
/// <summary>
/// Cell's DataType.
/// </summary>
public EnumValue<CellValues> DataType
{
get
{
return _dataType;
}
}
/// <summary>
/// Cell's Format ID.
/// </summary>
public uint FormatCode
{
get
{
return _formatCode;
}
}
/// <summary>
/// Cell's Original Value (Without format).
/// </summary>
public string Org_value
{
get
{
return _org_value;
}
}
}
}
No comments:
Post a Comment