Preserve DataTypes, Handle encoding

Mar 10, 2011 at 3:17 PM

I have modified this library to preserve the datatypes in the AsDataSet in the ExcelBinaryReader and the ExcelOpenXmlReader using System.Object datatype on the columns as opposed to strings.  

Also the ExcelOpenXmlReader is not handling string encoding properly resulting in strings such as _x000D_ instead of carriage returns.

It might be a nice enhancement to incorporate this into the base library.  If interested, I'd be happy to share.

Mar 16, 2011 at 7:51 AM

Please, share your work

Mar 17, 2011 at 7:13 PM

I'd be happy to, but as this forum doesn't allow attachments, I'm not sure how.  

There are a fair number of changes to support this (and a number of fixes to convertOADateTime in both readers)

Mar 17, 2011 at 8:40 PM
Edited Mar 17, 2011 at 9:46 PM

OK, here is a unix diff between my mods and changeset 59893.  

Note that several bugs fixed including reading boolean columns in binary,  Bad datetime conversions for 1/1/1900, Missing datetime conversions, and Numeric types getting converted to datetimes, string encoding.

diff -N -r C:\dev\crow.net\CROW\Excel/Core/BinaryFormat/XlsBiffFormatString.cs Excel/Core/BinaryFormat/XlsBiffFormatString.cs
1,102d0
< using System.Text;
< 
< namespace Excel.Core.BinaryFormat
< {
< 	/// <summary>
< 	/// Represents a string value of formula
< 	/// </summary>
< 	internal class XlsBiffFormatString : XlsBiffRecord
< 	{
< 
<         private Encoding m_UseEncoding =  Encoding.Default;
< 		private string m_value = null;
< 		
< 
< 		internal XlsBiffFormatString(byte[] bytes)
< 			: this(bytes, 0)
< 		{
< 		}
< 
<         internal XlsBiffFormatString(byte[] bytes, uint offset)
< 			: base(bytes, offset)
< 		{
< 		}
< 
< 
<         /// <summary>
<         /// Encoding used to deal with strings
<         /// </summary>
<         public Encoding UseEncoding
<         {
<             get { return m_UseEncoding; }
<             set { m_UseEncoding = value; }
<         }
< 
< 		/// <summary>
< 		/// Length of the string
< 		/// </summary>
< 		public ushort Length
< 		{
< 			get
< 			{
< 			     switch (ID)
< 			     {
< 			         case BIFFRECORDTYPE.FORMAT_V23:
< 			             return base.ReadByte(0x0);
< 			         default:
< 			             return base.ReadUInt16(2);
< 			     }
< 			}
< 		}
< 
< 		/// <summary>
< 		/// String text
< 		/// </summary>
<         public string Value
<         {
<             get
<             {
<                 if (m_value == null)
<                 {
<                     switch (ID)
<                     {
<                         case BIFFRECORDTYPE.FORMAT_V23:
<                             m_value = m_UseEncoding.GetString(m_bytes, m_readoffset + 1, Length);
<                             break;
<                         case BIFFRECORDTYPE.FORMAT:
<                             var offset = m_readoffset + 5;
<                             var flags = ReadByte(3);
<                             m_UseEncoding = (flags & 0x01) == 0x01 ? Encoding.Unicode : Encoding.Default;
<                             if ((flags & 0x04) == 0x01) // asian phonetic block size
<                                 offset += 4;
<                             if ((flags & 0x08) == 0x01) // number of rtf blocks
<                                 offset += 2;
<                             m_value = m_UseEncoding.IsSingleByte ? m_UseEncoding.GetString(m_bytes, offset, Length) : m_UseEncoding.GetString(m_bytes, offset, Length*2);
< 
<                             break;
< 
< 
<                     }
<                    
< 
<                 }
<                 return m_value;
<             }
<         }
< 
<         public ushort Index
<         {
<             get
<             {
<                 switch (ID)
<                 {
<                     case BIFFRECORDTYPE.FORMAT_V23:
<                         return 0;
<                     default:
<                         return ReadUInt16(0);
< 
<                 }
<             }
<         }
< 	}
< }
\ No newline at end of file
diff -N -r C:\dev\crow.net\CROW\Excel/Core/BinaryFormat/XlsBiffRecord.cs Excel/Core/BinaryFormat/XlsBiffRecord.cs
100,101d99
<                 case BIFFRECORDTYPE.BOOLERR:
<                 case BIFFRECORDTYPE.BOOLERR_OLD:
126,128d123
<                 case BIFFRECORDTYPE.FORMAT_V23:
<                 case BIFFRECORDTYPE.FORMAT:
<                     return new XlsBiffFormatString(bytes, offset);
diff -N -r C:\dev\crow.net\CROW\Excel/Core/BinaryFormat/XlsWorkbookGlobals.cs Excel/Core/BinaryFormat/XlsWorkbookGlobals.cs
12c12
<         private readonly Dictionary<ushort, XlsBiffFormatString> m_Formats = new Dictionary<ushort, XlsBiffFormatString>();
---
> 		private readonly List<XlsBiffRecord> m_Formats = new List<XlsBiffRecord>();
74c74
<         public Dictionary<ushort, XlsBiffFormatString> Formats
---
> 		public List<XlsBiffRecord> Formats
79,80d78
<         
< 
diff -N -r C:\dev\crow.net\CROW\Excel/Core/Helpers.cs Excel/Core/Helpers.cs
3,5d2
< using System.Data;
< using System.Globalization;
< using System.Text.RegularExpressions;
48,129c45
< 	    private static Regex re = new Regex("_x([0-9A-F]{4,4})_");
< 
<         public static string ConvertEscapeChars(string input)
<         {
<             return re.Replace(input, m => (((char)UInt32.Parse(m.Groups[1].Value, NumberStyles.HexNumber))).ToString());
<         }
< 
< 	    public static object ConvertFromOATime(double value)
< 	    {
< 	        if ((value >= 0.0) && (value < 60.0))
< 	        {
< 	            value++;
< 	        }
< 	        //if (date1904)
< 	        //{
< 	        //    Value += 1462.0;
< 	        //}
< 	        return DateTime.FromOADate(value);
< 	    }
< 
<         internal static void FixDataTypes(DataSet dataset)
<         {
<             var tables = new List<DataTable>(dataset.Tables.Count);
<             bool convert = false;
<             foreach (DataTable table in dataset.Tables)
<             {
<                
<                 if ( table.Rows.Count == 0)
<                 {
<                     tables.Add(table);
<                     continue;
<                 }
<                 DataTable newTable = null;
<                 for (int i = 0; i < table.Columns.Count; i++)
<                 {
<                     Type type = null;
<                     foreach (DataRow row  in table.Rows)
<                     {
<                         if (row.IsNull(i))
<                             continue;
<                         var curType = row[i].GetType();
<                         if (curType != type)
<                         {
<                             if (type == null)
<                                 type = curType;
<                             else
<                             {
<                                 type = null;
<                                 break;
<                             }
<                         }
<                     }
<                     if (type != null)
<                     {
<                         convert = true;
<                         if (newTable == null)
<                             newTable = table.Clone();
<                         newTable.Columns[i].DataType = type;
< 
<                     }
<                 }
<                 if (newTable != null)
<                 {
<                     newTable.BeginLoadData();
<                     foreach (DataRow row in table.Rows)
<                     {
<                         newTable.ImportRow(row);
<                     }
< 
<                     newTable.EndLoadData();
<                     tables.Add(newTable);
< 
<                 }
<                 else tables.Add(table);
<             }
<             if (convert)
<             {
<                 dataset.Tables.Clear();
<                 dataset.Tables.AddRange(tables.ToArray());
<             }
<         }
<     }
---
> 	}
diff -N -r C:\dev\crow.net\CROW\Excel/ExcelBinaryReader.cs Excel/ExcelBinaryReader.cs
6d5
< using Excel.Core;
209,221c208,211
<                     case BIFFRECORDTYPE.FORMAT_V23:
< 				        {
< 				            var fmt = (XlsBiffFormatString) rec;
< 				            fmt.UseEncoding = m_encoding;
< 				            m_globals.Formats.Add((ushort) m_globals.Formats.Count, fmt);
< 				        }
<                         break;
<                     case BIFFRECORDTYPE.FORMAT:
< 				        {
< 				            var fmt = (XlsBiffFormatString) rec;
<                             m_globals.Formats.Add(fmt.Index, fmt);
< 				        }
< 				        break;
---
> 					case BIFFRECORDTYPE.FORMAT:
> 					case BIFFRECORDTYPE.FORMAT_V23:
> 						m_globals.Formats.Add(rec);
> 						break;
373c363
< 								table.Columns.Add(m_cellsValues[i].ToString(), typeof(Object));
---
> 								table.Columns.Add(m_cellsValues[i].ToString());
375c365
<                                 table.Columns.Add(string.Concat(COLUMN, i), typeof(Object));
---
> 								table.Columns.Add(string.Concat(COLUMN, i));
382c372
<                             table.Columns.Add(null, typeof(Object));
---
> 							table.Columns.Add();
410,417d399
<                 case BIFFRECORDTYPE.BOOLERR:
<                     if (cell.ReadByte(7) == 0)
<                         m_cellsValues[cell.ColumnIndex] = cell.ReadByte(6) != 0;
<                     break;
<                 case BIFFRECORDTYPE.BOOLERR_OLD:
<                     if (cell.ReadByte(8) == 0)
<                         m_cellsValues[cell.ColumnIndex] = cell.ReadByte(7) != 0;
<                     break;
453,454c435
<                         _dValue = _rkCell.GetValue(j);
<                         m_cellsValues[j] = !m_ConvertOADate ? _dValue : tryConvertOADateTime(_dValue, _rkCell.GetXF(j));
---
> 						m_cellsValues[j] = _rkCell.GetValue(j);
549c530
< 		private  object tryConvertOADateTime(double value, ushort XFormat)
---
> 		private static object tryConvertOADateTime(double value, ushort XFormat)
551,653c532,561
< 		    ushort format = 0;
<             if (XFormat >= 0 && XFormat < m_globals.ExtendedFormats.Count)
<             {
<                 var rec = m_globals.ExtendedFormats[XFormat];
<                 switch (rec.ID)
<                 {
<                     case BIFFRECORDTYPE.XF_V2:
<                         format = (ushort) (rec.ReadByte(2) & 0x3F);
<                         break;
<                     case BIFFRECORDTYPE.XF_V3:
<                           if ((rec.ReadByte(3) & 4) == 0)
<                             return value.ToString();
<                         format = rec.ReadByte(1);
<                         break;
<                     case BIFFRECORDTYPE.XF_V4:
<                         if ((rec.ReadByte(5) & 4) == 0)
<                             return value.ToString();
<                         format = rec.ReadByte(1);
<                         break;
<                     
<                     default:
<                         if ((rec.ReadByte(m_globals.Sheets[m_globals.Sheets.Count-1].IsV8 ? 9 : 7) & 4) == 0)
<                             return value.ToString();
<                         
<                         format = rec.ReadUInt16(2);
<                         break;
<                 }
<             }
< 
<            
<             switch (format)
<             {
<                 // numeric built in formats
<                 case 0: //"General";
<                 case 1: //"0";
<                 case 2: //"0.00";
<                 case 3: //"#,##0";
<                 case 4: //"#,##0.00";
<                 case 5: //"\"$\"#,##0_);(\"$\"#,##0)";
<                 case 6: //"\"$\"#,##0_);[Red](\"$\"#,##0)";
<                 case 7: //"\"$\"#,##0.00_);(\"$\"#,##0.00)";
<                 case 8: //"\"$\"#,##0.00_);[Red](\"$\"#,##0.00)";
<                 case 9: //"0%";
<                 case 10: //"0.00%";
<                 case 11: //"0.00E+00";
<                 case 12: //"# ?/?";
<                 case 13: //"# ??/??";
<                 case 0x30:// "##0.0E+0";
<                
<                 case 0x25:// "_(#,##0_);(#,##0)";
<                 case 0x26:// "_(#,##0_);[Red](#,##0)";
<                 case 0x27:// "_(#,##0.00_);(#,##0.00)";
<                 case 40:// "_(#,##0.00_);[Red](#,##0.00)";
<                 case 0x29:// "_(\"$\"* #,##0_);_(\"$\"* (#,##0);_(\"$\"* \"-\"_);_(@_)";
<                 case 0x2a:// "_(\"$\"* #,##0_);_(\"$\"* (#,##0);_(\"$\"* \"-\"_);_(@_)";
<                 case 0x2b:// "_(\"$\"* #,##0.00_);_(\"$\"* (#,##0.00);_(\"$\"* \"-\"??_);_(@_)";
<                 case 0x2c:// "_(* #,##0.00_);_(* (#,##0.00);_(* \"-\"??_);_(@_)";
<                     return value;
< 
<                     // date formats
<                 case 14: //this.GetDefaultDateFormat();
<                 case 15: //"D-MM-YY";
<                 case 0x10: // "D-MMM";
<                 case 0x11: // "MMM-YY";
<                 case 0x12: // "h:mm AM/PM";
<                 case 0x13: // "h:mm:ss AM/PM";
<                 case 20: // "h:mm";
<                 case 0x15: // "h:mm:ss";
<                 case 0x16: // string.Format("{0} {1}", this.GetDefaultDateFormat(), this.GetDefaultTimeFormat());
<                    
<                 case 0x2d: // "mm:ss";
<                 case 0x2e: // "[h]:mm:ss";
<                 case 0x2f: // "mm:ss.0";
<                     return Helpers.ConvertFromOATime(value);
<                  case 0x31:// "@";
<                     return value.ToString();
< 
<                 default:
<                     XlsBiffFormatString fmtString;
<                     if (m_globals.Formats.TryGetValue(format, out fmtString) )
<                     {
<                         var fmt = fmtString.Value.ToUpper();
<                         if ((((fmt.IndexOf("D") >= 0) || (fmt.IndexOf("M") >= 0)) || ((fmt.IndexOf("YY") >= 0) || (fmt.IndexOf("H") >= 0))) || (fmt.IndexOf("SS") >= 0))
<                             return Helpers.ConvertFromOATime(value); 
< 
<                     }
<                     return value;
< 
< 
< 
<             }
< 
< 		    
< 		}
< 
< 	    private  object tryConvertOADateTime(object value, ushort XFormat)
< 	    {
< 	        double _dValue;
< 
< 
< 	        if (double.TryParse(value.ToString(), out _dValue))
< 	            return tryConvertOADateTime(_dValue, XFormat);
< 	        return _dValue;
---
> 			switch (XFormat)
> 			{
> 				//Time format
> 				case 63:
> 				case 68:
> 					DateTime time = DateTime.FromOADate(value);
> 
> 					return (time.Second == 0)
> 						? time.ToShortTimeString()
> 						: time.ToLongTimeString();
> 
> 				//Date Format
> 				case 26:
> 				case 62:
> 				case 64:
> 				case 67:
> 				case 69:
> 				case 70:
> 				case 100: return DateTime.FromOADate(value).ToString(System.Globalization.CultureInfo.CurrentCulture);
> 				//case 100: return DateTime.FromOADate(value).ToString(System.Globalization.CultureInfo.InvariantCulture);
> 
> 				default:
> 					return value;
> 			}
> 		}
> 
> 		private static object tryConvertOADateTime(object value, ushort XFormat)
> 		{
> 			double _dValue;
> 			object r;
655c563,575
< 	    }
---
> 			try
> 			{
> 				_dValue = double.Parse(value.ToString());
> 
> 				r = tryConvertOADateTime(_dValue, XFormat);
> 			}
> 			catch (FormatException)
> 			{
> 				r = value;
> 			}
> 
> 			return r;
> 		}
657c577
< 	    private bool isV8()
---
> 		private bool isV8()
686d605
<             
698,699d616
< 		    m_workbookData.AcceptChanges();
< 		    Helpers.FixDataTypes(m_workbookData);
diff -N -r C:\dev\crow.net\CROW\Excel/ExcelOpenXmlReader.cs Excel/ExcelOpenXmlReader.cs
68c68
< 				string fc = numFmt.FormatCode.ToLower();
---
> 				string fc = numFmt.FormatCode;
153,201c153,195
<                 while (_xmlReader.Read())
<                 {
<                     if (_xmlReader.Depth == 2) break;
< 
<                     if (_xmlReader.NodeType == XmlNodeType.Element)
<                     {
<                         hasValue = false;
< 
<                         if (_xmlReader.Name == XlsxWorksheet.N_c)
<                         {
<                             a_s = _xmlReader.GetAttribute(XlsxWorksheet.A_s);
<                             a_t = _xmlReader.GetAttribute(XlsxWorksheet.A_t);
<                             a_r = _xmlReader.GetAttribute(XlsxWorksheet.A_r);
<                             XlsxDimension.XlsxDim(a_r, out col, out row);
<                         }
<                         else if (_xmlReader.Name == XlsxWorksheet.N_v)
<                         {
<                             hasValue = true;
<                         }
<                     }
< 
<                     if (_xmlReader.NodeType == XmlNodeType.Text && hasValue)
<                     {
<                         double number;
<                         object o = _xmlReader.Value;
<                         if (double.TryParse(o.ToString(), out number))                        
<                             o = number;
< 
< 
<                         if (null != a_t && a_t == XlsxWorksheet.A_s)
<                         {
<                             o = Helpers.ConvertEscapeChars(_workbook.SST[(int)o]);
<                         }
<                         else if (null != a_s)
<                         {
<                             XlsxXf xf = _workbook.Styles.CellXfs[int.Parse(a_s)];
<                             if (xf.ApplyNumberFormat && o != null && o.ToString() != string.Empty && IsDateTimeStyle(xf.NumFmtId))
<                                 o = Helpers.ConvertFromOATime(number);
<                             else if (xf.NumFmtId == 49)
<                                 o = o.ToString();
<                         }
<                         else if (a_t == "b") //boolean
<                             o = _xmlReader.Value == "1";                          
< 
< 
<                         if (col - 1 < _cellsValues.Length)
<                             _cellsValues[col - 1] = o;
<                     }
<                 }
---
>  				while (_xmlReader.Read())
>  				{
>  					if (_xmlReader.Depth == 2) break;
>  
> 					if (_xmlReader.NodeType == XmlNodeType.Element)
>  					{
> 						hasValue = false;
>  
> 						if (_xmlReader.Name == XlsxWorksheet.N_c)
> 						{
> 							a_s = _xmlReader.GetAttribute(XlsxWorksheet.A_s);
> 							a_t = _xmlReader.GetAttribute(XlsxWorksheet.A_t);
> 							a_r = _xmlReader.GetAttribute(XlsxWorksheet.A_r);
> 							XlsxDimension.XlsxDim(a_r, out col, out row);
> 						}
> 						else if (_xmlReader.Name == XlsxWorksheet.N_v)
> 						{
> 							hasValue = true;
> 						}
> 					}
>  
> 					if (_xmlReader.NodeType == XmlNodeType.Text && hasValue)
> 					{
> 						object o = _xmlReader.Value;
>  
>  						if (null != a_t && a_t == XlsxWorksheet.A_s)
>  						{
> 							o = _workbook.SST[Convert.ToInt32(o)];
> 						}
> 						else if (null != a_s)
> 						{
> 							XlsxXf xf = _workbook.Styles.CellXfs[int.Parse(a_s)];
> 
> 							if (xf.ApplyNumberFormat && IsDateTimeStyle(xf.NumFmtId) && o != null && o.ToString() != string.Empty)
> 							{
> 								o = DateTime.FromOADate(Convert.ToDouble(o, CultureInfo.InvariantCulture));
> 							}
> 						}
> 
> 						if (col - 1 < _cellsValues.Length)
> 							_cellsValues[col - 1] = o;
> 					}
> 				}
208c202,203
< 				_depth++;
---
> 				else
> 					_depth++;
275d269
< 			  
289c283
<                         table.Columns.Add(null, typeof(Object));
---
> 						table.Columns.Add();
297c291
<                             table.Columns.Add(_cellsValues[index].ToString(), typeof(Object));
---
> 							table.Columns.Add(_cellsValues[index].ToString());
299c293
<                             table.Columns.Add(string.Concat(COLUMN, index), typeof(Object));
---
> 							table.Columns.Add(string.Concat(COLUMN, index));
303,304d296
<                 
<                 table.BeginLoadData();
313d304
<                 table.EndLoadData();
315,316c306
<             dataset.AcceptChanges();
< 		    Helpers.FixDataTypes(dataset);
---
> 

Dec 11, 2011 at 10:52 PM

Hi,

My problem is that the currency values are recognized as a DateTime. For example: $43.08 will be "2/11/1900 1:55:20 AM". Do you think this patch resolve this issue? If it is, could you please send me your version. My email address is: kardkovacsi@gmail.com

Thx.

Dec 28, 2011 at 8:29 PM

Attach an example to http://exceldatareader.codeplex.com/workitem/8536 and I'll take a look.

Jun 28, 2012 at 5:10 AM

Hi,

    Please say me where to download your code to avoid date-time issue.