Excel Interop

Generic List converter to multidimensional (2D) string array (for excel data population)

Posted on Updated on

This post is still related to the excel data population that I posted 2 weeks back where I concluded that the most performant way for the data population is using multidimensional string array (multi cells & rows).

Let’s first take a look at the SampleObject class that we use to populate the data into excel

class SampleObject
{
    public string A { get; set; }
    public string B { get; set; }
    public string C { get; set; }
    public string D { get; set; }
    public string E { get; set; }
}

Now in the previous post, the data population is being done “statically” compiled, means it only supports for that exact SampleObject type/class.

static void PopulateItemsWithMultiCellsAndRowsMethod(Worksheet worksheet, IList<SampleObject> items)
{
    LogFormat("Start populating {0} rows using multi cells and rows method", items.Count.ToString());
    var stringArray = new string[items.Count, 5];
    for (int i = 0; i < items.Count; i++)
    {
        var item = items[i];
        var rowIndex = i + 1;
        stringArray[i, 0] = item.A;
        stringArray[i, 1] = item.B;
        stringArray[i, 2] = item.C;
        stringArray[i, 3] = item.D;
        stringArray[i, 4] = item.E;
    }
 
    var r = worksheet.get_Range(string.Format("A{0}:E{1}", 1, items.Count));
    r.Value = stringArray;
    DisposeCOMObject(r);
}

The above will work just fine until there are other “SampleObjectX”s that need to be supported, so it will be more convenient if we can “dynamically” do this for any given objects.

One obvious answer will be using Reflection but there is also another alternative which is generating dynamic methods using Expression Tree. There are pros and cons between the 2 options above (nicely explained by this post).

In below code snippet, I summarized the 3 approaches

1. Statically

string[,] ConvertTo2DStringArrayStatically(IList<SampleObject> items)
{   
    var stringArray = new string[items.Count, 5];
    for (int i = 0; i < items.Count; i++)
    {
        var item = items[i];
        stringArray[i, 0] = item.A;
        stringArray[i, 1] = item.B;
        stringArray[i, 2] = item.C;
        stringArray[i, 3] = item.D;
        stringArray[i, 4] = item.E;
    }
    return stringArray;
}

2. With reflection

string[,] ConvertTo2DStringArrayWithReflection<T>(IList<T> items)
{
    var props = typeof(T).GetProperties();
    var stringArray = new string[items.Count, props.Length];

    for (int i = 0; i < items.Count; i++)
    {
        var item = items[i];
        for (int iProp = 0; iProp < props.Length; iProp++)
        {
            stringArray[i, iProp] = props[iProp].GetValue(item).ToString();
        }
    }
    return stringArray;
}

3. With Expression Tree (no cache is added intentionally for brevity purpose)

Func<IList<T>, string[,]> ConvertTo2DStringArrayWithExpressionTree<T>()
{
    var t = typeof(T);
    var tProperties = t.GetProperties();

    var statements = new List<Expression>();

    var listInputParameter = Expression.Parameter(typeof(IList<T>));
    var itemPropertiesCount = Expression.Constant(tProperties.Length);
    var rowCountExpression = Expression.Property(listInputParameter, typeof(ICollection<T>), "Count");
    var arrayIndexVariable = Expression.Variable(typeof(int));
    var currentItem = Expression.Variable(t);
    
    statements.Add(Expression.Call(typeof(Console).GetMethod("WriteLine", new Type[] { typeof(string), typeof(string) })
                    , Expression.Constant("Converting {0} rows with expression tree")
                    , Expression.Call(rowCountExpression, typeof(object).GetMethod("ToString"))));

    // Initialize string array based on the items row count and the 
    var stringArrayVariable = Expression.Variable(typeof(string[,]));
    var newArrayExpression = Expression.NewArrayBounds(typeof(string), rowCountExpression, itemPropertiesCount);
    var initializeArray = Expression.Assign(stringArrayVariable, newArrayExpression);
    statements.Add(initializeArray);

    // Prepare item assignments here as it requires loop on the properties which can't be done inline
    var itemStatements = new List<Expression>();
    itemStatements.Add(Expression.Assign(currentItem, Expression.Property(listInputParameter, "Item", arrayIndexVariable)));
    for (int pIndex = 0; pIndex < tProperties.Length; pIndex++)
    {
        var propExpression = Expression.Property(currentItem, tProperties[pIndex]);
        itemStatements.Add(Expression.Assign(Expression.ArrayAccess(stringArrayVariable, new List<Expression> { arrayIndexVariable, Expression.Constant(pIndex) }), propExpression));
    }
    itemStatements.Add(Expression.PostIncrementAssign(arrayIndexVariable));

    // iterate the items
    var label = Expression.Label();
    var forLoopBody = Expression.Block(
        new[] { arrayIndexVariable }, // local variable
        Expression.Assign(arrayIndexVariable, Expression.Constant(0)), // initialize with 0
        Expression.Loop(
            Expression.IfThenElse(
                Expression.LessThan(arrayIndexVariable, rowCountExpression), // test
                Expression.Block(new[] { currentItem }, itemStatements), // execute if true
                Expression.Break(label)) // execute if false
            , label));
    statements.Add(forLoopBody);

    // return statement
    statements.Add(stringArrayVariable); 

    var body = Expression.Block(stringArrayVariable.Type, new[] { stringArrayVariable }, statements.ToArray());
    var compiled = Expression.Lambda<Func<IList<T>, string[,]>>(body, listInputParameter).Compile();
    return compiled;
}

the expression tree “body” in DebugView

.Block(System.String[,] $var1) {
    .Call System.Console.WriteLine("Converting {0} rows with expression tree", .Call ($var2.Count).ToString());
    $var1 = .NewArray System.String[$var2.Count,5];
    .Block(System.Int32 $var3) {
        $var3 = 0;
        .Loop  {
            .If ($var3 < $var2.Count) {
                .Block(ExcelDataPopulator.SampleObject $var4) {
                    $var4 = $var2.Item[$var3];
                    $var1[$var3,0] = $var4.A;
                    $var1[$var3,1] = $var4.B;
                    $var1[$var3,2] = $var4.C;
                    $var1[$var3,3] = $var4.D;
                    $var1[$var3,4] = $var4.E;
                    $var3++
                }
            } .Else {
                .Break #Label1 { }
            }
        }
        .LabelTarget #Label1:
    };
    $var1
}

From the performance benchmark side,

var sw = Stopwatch.StartNew();

var items = Enumerable.Range(1, 1000000).Select(i => new SampleObject { A = "A" + i, B = "B" + i, C = "C" + i, D = "D" + i, E = "E" + i }).ToList();
string[,] stringArray = null;

LogFormat("Initialized {0} rows, {1} ms", items.Count.ToString(), sw.ElapsedMilliseconds.ToString());
sw.Restart();

for (int i = 0; i < 10; i++)
    stringArray = ConvertTo2DStringArrayStatically(items);

LogFormat("Convert with statically is completed in {0} ms", sw.ElapsedMilliseconds.ToString());
sw.Restart();

for (int i = 0; i < 10; i++)
    stringArray = ConvertTo2DStringArrayWithReflection(items);

LogFormat("Convert with reflection is completed in {0} ms", sw.ElapsedMilliseconds.ToString());
sw.Restart();

for (int i = 0; i < 10; i++)
    stringArray = ConvertTo2DStringArrayWithExpressionTree<SampleObject>()(items);

LogFormat("Convert with expression tree is completed in {0} ms", sw.ElapsedMilliseconds.ToString());

// log output:
//Initialized 1,000,000 rows, 1,747 ms
//Convert with statically is completed in 1,077 ms
//Convert with reflection is completed in 10,915 ms
//Convert with expression tree is completed in 1,617 ms

So conclusions:

  1. statically will always be the most performant but maintainability is a bit of a challenging if we have many objects/types to support.
  2. reflection is 10x slower but the code is much simpler
  3. dynamic method via expression tree is having more code to build the expressions but the performance is only slightly slower than statically compiled option.

There are couple of more options available that I didn’t cover above:

  1. Code generation using class metadata/reflection
  2. Reflection.Emit

Full source code is available at https://github.com/bembengarifin/ExcelDataPopulator)

Generate (lazy and infinite) list of excel column names sequence

Posted on Updated on

While I was looking into the excel data population, I found that in order to populate the data with different column size (fields) into excel, I will definitely require some list of excel column names for this purpose.

 

// "A" = column index (int) to populate from 
// "E" = column index (int) based on A above + total fields count
worksheet.get_Range("A1:E10000"); 

The interesting here is how the excel column names sequence is not the same as number sequence (below)

Excel Sequence

There are various ways described here which will get the job done, and excel is limited to 16,384 columns (last column name is XFD)

However, I’m keen to create a function which can generate “unlimited” sequence. I got this idea from the recent Functional Programming course where Haskell has this infinite list and lazy evaluation and in C#, this laziness can be represented with IEnumerable and yield.

Here’s the function:

/// <summary>
/// This is an infinite list of excel kind of sequence, the caller will be controlling how much of data that it really requires
/// The method signature is very similar to Enumerable.Range(start, count)
/// 
/// Whereas, the other 2 optionals parameters are not for the caller, they are only for internal recursive call
/// </summary>
/// <param name="start">The value of the first integer in the sequence.</param>
/// <param name="count">The number of sequential integers to generate.</param>
/// <param name="depth">NOT TO BE SPECIFIED DIRECTLY, used for recursive purpose</param>
/// <param name="slots">NOT TO BE SPECIFIED DIRECTLY, used for recursive purpose</param>
/// <returns></returns>
IEnumerable<int[]> Iterate(int start, int count, int depth = 1, int[] slots = null)
{
    if (slots == null) slots = new int[depth];

    for (int i = start; i < start + count; i++)
    {
        slots[depth - 1] = i;

        if (depth > 1)
            foreach (var x in Iterate(start, count, depth - 1, slots)) yield return x;
        else
            yield return slots.Reverse().ToArray();
    }

    if (slots.Length == depth)
        foreach (var x in Iterate(start, count, depth + 1, null)) yield return x;
}

Any of the code below will generate the same column names from “A” to “XFD” in 30ms on my laptop (i7 2.5Ghz Quad core)

foreach (var item in Iterate(0, 26).Take(16384))
{
    var columnName = string.Join("-", item.Select(x => (char)(x + 65)).ToArray()).Trim();
    Debug.WriteLine(columnName);
}

foreach (var item in Iterate(65, 26).Take(16384))
{
    var columnName = string.Join("-", item.Select(x => (char)x).ToArray()).Trim();
    Debug.WriteLine(columnName);
}

And the most interesting “infinite” test :)

var i = 0;
foreach (var item in Iterate(0, 26))
{
    var columnName = string.Join("-", item.Select(x => (char)(x + 65)).ToArray()).Trim();
    Debug.WriteLine("#{0} = {1}", i, columnName);
    i++;
}

/* 
#0 = A
.....
#16383 = X-F-D (excel limit)
.....
#133866109 = K-F-X-J-T-N
.....
#321272406 = A-A-A-A-A-A-A
.....
*/

Ways to populate data into excel via C# and excel interop (with performance comparison)

Posted on Updated on

Few days ago I was looking into some excel automation code where I need to fetch the data from the back end and then write/populate the data into excel.

There are 3 ways that we can populate the data into excel via excel interop.

1. Single cell

var r1 = worksheet.get_Range("A1"); // single cell
r1.Value = "A1";

2. Multi cells for 1 row

var r2 = worksheet.get_Range("A2:C2"); // multi cells in 1 row
r2.Value = new[] { "A2", "B2", "C2" };

3. Multi cells & rows

var r3 = worksheet.get_Range("A3:C4"); // multi cells & rows
r3.Value = new string[,] { { "A3", "B3", "C3" }, { "A4", "B4", "C4" } }; ;

As the work is related to large volume of data (100K rows), I looked further into the options above to find the most performant one.
So I came up with a list of 100k rows very simple object (5 properties), gave each option a run, and got the stats:

Start populating 100,000 rows using single cell method
Populate data completed in 196,206 ms

Start populating 100,000 rows using multi cells (per 1 row) method
Populate data completed in 43,039 ms

Start populating 100,000 rows using multi cells and rows method
Populate data completed in 1,028 ms

Sample output

Excel100kRows

Quite staggering differences between the options and I’m quite impressed with the excellent performance of the multi cells/rows option.

Included the code snippets below for quick view (full source code is also available at https://github.com/bembengarifin/ExcelDataPopulator)

static void PopulateItemsWithSingleCellMethod(Worksheet worksheet, IList<SampleObject> items)
{
    LogFormat("Start populating {0} rows using single cell method", items.Count.ToString());
    for (int i = 0; i < items.Count; i++)
    {
        var item = items[i];
        var rowIndex = i + 1;
        var r1 = worksheet.get_Range("A" + rowIndex); r1.Value = item.A; DisposeCOMObject(r1);
        var r2 = worksheet.get_Range("B" + rowIndex); r2.Value = item.B; DisposeCOMObject(r2);
        var r3 = worksheet.get_Range("C" + rowIndex); r3.Value = item.C; DisposeCOMObject(r3);
        var r4 = worksheet.get_Range("D" + rowIndex); r4.Value = item.D; DisposeCOMObject(r4);
        var r5 = worksheet.get_Range("E" + rowIndex); r5.Value = item.E; DisposeCOMObject(r5);
    }
}

static void PopulateItemsWithMultiCellsOneRowMethod(Worksheet worksheet, IList<SampleObject> items)
{
    LogFormat("Start populating {0} rows using multi cells (per 1 row) method", items.Count.ToString());
    for (int i = 0; i < items.Count; i++)
    {
        var item = items[i];
        var rowIndex = i + 1;
        var r = worksheet.get_Range(string.Format("A{0}:E{0}", rowIndex));
        r.Value = new[] { item.A, item.B, item.C, item.D, item.E };
        DisposeCOMObject(r);
    }
}

static void PopulateItemsWithMultiCellsAndRowsMethod(Worksheet worksheet, IList<SampleObject> items)
{
    LogFormat("Start populating {0} rows using multi cells and rows method", items.Count.ToString());
    var stringArray = new string[items.Count, 5];
    for (int i = 0; i < items.Count; i++)
    {
        var item = items[i];
        var rowIndex = i + 1;
        stringArray[i, 0] = item.A;
        stringArray[i, 1] = item.B;
        stringArray[i, 2] = item.C;
        stringArray[i, 3] = item.D;
        stringArray[i, 4] = item.E;
    }

    var r = worksheet.get_Range(string.Format("A{0}:E{1}", 1, items.Count));
    r.Value = stringArray;
    DisposeCOMObject(r);
}

Combine multiple excel files into one with C#

Posted on Updated on

I created the code below as the application that I’m working on is using different 3rd party grid controls or there are multiple grids in the report which when exported, they are being exported into different excel files and it’s much easier if we can provide 1 single file output but have multiple worksheets for users to use. And I actually re-used the existing code below for some other manual extract stuff from CSV that we need to merge into 1 excel file, worked great ;)

Have the codes below for a while, but just got some time to post this as i think it might be handy for everyone or for ownself in future in fact ;)

I guess this is the kind of code that you only want to write once and gosh look at those Type.Missing :p

    public class ExcelEngine
    {
        /// &lt;summary&gt;
        /// To combine multiple workbooks into a file
        /// &lt;/summary&gt;
        /// &lt;remarks&gt;
        /// The following file name convention will be used while combining the child files
        /// exportFileKey_[Description] where the description will be the tab name
        ///
        /// The ordering of the worksheets will be using the file creation time
        ///
        /// The above convention can be enhanced when necessary but need to make sure the backward compatibility for the existing codes
        ///
        /// Note:
        /// - the index starts from 1 in the excel automation array
        /// - be careful when making changes, especially moving things around in the method, e.g. prompts might come up unexpectedly
        /// - to avoid &quot;zombie&quot; excel instances in the task manager when referencing the COM object, please refer to the http://support.microsoft.com/default.aspx/kb/317109
        ///
        /// &lt;/remarks&gt;
        /// &lt;param name=&quot;exportFilePath&quot;&gt;the destination file name choosen by the user&lt;/param&gt;
        /// &lt;param name=&quot;exportFileKey&quot;&gt;the unique key file name choosen by the user, this is to avoid merging files with similar names&lt;/param&gt;
        /// &lt;param name=&quot;rawFilesDirectory&quot;&gt;the folder where the files are being generated, this can be temp folder or any folder basically&lt;/param&gt;
        /// &lt;param name=&quot;deleteRawFiles&quot;&gt;delete the raw files after completed?&lt;/param&gt;
        ///
        /// &lt;returns&gt;&lt;/returns&gt;
        public static bool CombineWorkBooks(string exportFilePath, string exportFileKey, string rawFilesDirectory, bool deleteRawFiles)
        {
            Application xlApp = null;
            Workbooks newBooks = null;
            Workbook newBook = null;
            Sheets newBookWorksheets = null;
            Worksheet defaultWorksheet = null;
            IEnumerable&lt;string&gt; filesToMerge = null;
            bool areRowsTruncated = false;

            try
            {
                Console.WriteLine(&quot;Method: CombineWorkBooks - Starting excel&quot;);
                xlApp = new Application();

                if (xlApp == null)
                {
                    Console.WriteLine(&quot;EXCEL could not be started. Check that your office installation and project references are correct.&quot;);
                    return false;
                }

                Console.WriteLine(&quot;Method: CombineWorkBooks - Disabling the display alerts to prevent any prompts during workbooks close&quot;);
                // not an elegant solution? however has to do this else will prompt for save on exit, even set the Saved property didn't help
                xlApp.DisplayAlerts = false;

                Console.WriteLine(&quot;Method: CombineWorkBooks - Set Visible to false as a background process, else it will be displayed in the task bar&quot;);
                xlApp.Visible = false;

                Console.WriteLine(&quot;Method: CombineWorkBooks - Create a new workbook, comes with an empty default worksheet&quot;);
                newBooks = xlApp.Workbooks;
                newBook = newBooks.Add(XlWBATemplate.xlWBATWorksheet);
                newBookWorksheets = newBook.Worksheets;

                // get the reference for the empty default worksheet
                if (newBookWorksheets.Count &gt; 0)
                {
                    defaultWorksheet = newBookWorksheets[1] as Worksheet;
                }

                Console.WriteLine(&quot;Method: CombineWorkBooks - Get the files sorted by creation date&quot;);
                var dirInfo = new DirectoryInfo(rawFilesDirectory);
                filesToMerge = from f in dirInfo.GetFiles(exportFileKey + &quot;*&quot;, SearchOption.TopDirectoryOnly)
                               orderby f.CreationTimeUtc
                               select f.FullName;

                foreach (var filePath in filesToMerge)
                {
                    Workbook childBook = null;
                    Sheets childSheets = null;
                    try
                    {
                        Console.WriteLine(&quot;Method: CombineWorkBooks - Processing {0}&quot;, filePath);
                        childBook = newBooks.Open(filePath, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing
                                                             , Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing);

                        childSheets = childBook.Worksheets;
                        if (childSheets != null)
                        {
                            for (int iChildSheet = 1; iChildSheet &lt;= childSheets.Count; iChildSheet++)
                            {
                                Worksheet sheetToCopy = null;
                                try
                                {
                                    sheetToCopy = childSheets[iChildSheet] as Worksheet;
                                    if (sheetToCopy != null)
                                    {
                                        Console.WriteLine(&quot;Method: CombineWorkBooks - Assigning the worksheet name&quot;);
                                        sheetToCopy.Name = Truncate(GetReportDescription(Path.GetFileNameWithoutExtension(filePath), sheetToCopy.Name), 31); // only 31 char max

                                        Console.WriteLine(&quot;Method: CombineWorkBooks - Copy the worksheet before the default sheet&quot;);
                                        sheetToCopy.Copy(defaultWorksheet, Type.Missing);
                                    }
                                }
                                finally
                                {
                                    DisposeCOMObject(sheetToCopy);
                                }
                            }

                            Console.WriteLine(&quot;Method: CombineWorkBooks - Close the childbook&quot;);
                            // for some reason, calling close below may cause an exception -&gt; System.Runtime.InteropServices.COMException (0x80010108): The object invoked has disconnected from its clients.
                            childBook.Close(false, Type.Missing, Type.Missing);
                        }
                    }
                    finally
                    {
                        DisposeCOMObject(childSheets);
                        DisposeCOMObject(childBook);
                    }
                }

                Console.WriteLine(&quot;Method: CombineWorkBooks - Delete the empty default worksheet&quot;);
                if (defaultWorksheet != null) defaultWorksheet.Delete();

                Console.WriteLine(&quot;Method: CombineWorkBooks - Save the new book into the export file path: {0}&quot;, exportFilePath);
                newBook.SaveAs(exportFilePath, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing
                    , XlSaveAsAccessMode.xlExclusive, Type.Missing, Type.Missing, Type.Missing, Type.Missing, Type.Missing);

                newBooks.Close();
                xlApp.DisplayAlerts = true;

                return true;
            }
            catch (Exception ex)
            {
                Console.WriteLine(&quot;Method: CombineWorkBooks - Exception: {0}&quot;, ex.ToString());
                return false;
            }
            finally
            {
                DisposeCOMObject(defaultWorksheet);
                DisposeCOMObject(newBookWorksheets);
                DisposeCOMObject(newBooks);
                DisposeCOMObject(newBook);

                Console.WriteLine(&quot;Method: CombineWorkBooks - Closing the excel app&quot;);
                if (xlApp != null)
                {
                    xlApp.Quit();
                    DisposeCOMObject(xlApp);
                }

                if (deleteRawFiles)
                {
                    Console.WriteLine(&quot;Method: CombineWorkBooks - Deleting the temporary files&quot;);
                    DeleteTemporaryFiles(filesToMerge);
                }
            }
        }

        private static void DisposeCOMObject(object o)
        {
            Console.WriteLine(&quot;Method: DisposeCOMObject - Disposing&quot;);
            if (o == null)
            {
                return;
            }
            try
            {
                System.Runtime.InteropServices.Marshal.ReleaseComObject(o);
            }
            catch (Exception ex)
            {
                Console.WriteLine(&quot;Method: DisposeCOMObject - Exception: {0}&quot;, ex.ToString());
            }
        }

        private static void DeleteTemporaryFiles(IEnumerable&lt;string&gt; tempFilenames)
        {
            foreach (var tempFile in tempFilenames)
            {
                try
                {
                    File.Delete(tempFile);
                }
                catch
                    (Exception)
                {
                    Console.WriteLine(&quot;Could not delete temporary file '{0}'&quot;, tempFilenames);
                }
            }
        }

        /// &lt;summary&gt;
        /// the first array item will be the key
        /// &lt;/summary&gt;
        /// &lt;param name=&quot;fileName&quot;&gt;&lt;/param&gt;
        /// &lt;param name=&quot;defaultName&quot;&gt;&lt;/param&gt;
        /// &lt;returns&gt;&lt;/returns&gt;
        protected static string GetReportDescription(string fileName, string defaultName)
        {
            var splits = fileName.Split('_');
            return splits.Length &gt; 1 ? string.Join(&quot;-&quot;, splits, 1, splits.Length - 1) : defaultName;
        }

        /// &lt;summary&gt;
        /// Get a substring of the first N characters.
        /// http://dotnetperls.com/truncate-string
        /// &lt;/summary&gt;
        public static string Truncate(string source, int length)
        {
            if (source.Length &gt; length)
            {
                source = source.Substring(0, length);
            }
            return source;
        }

}