Skip to content

Commit 82c315f

Browse files
Prashanth Govindarajan, msftbot[bot]
Prashanth Govindarajan
authored and
msftbot[bot]
committed
Append rows to a DataFrame (dotnet#2823)
* Append rows to a DataFrame * Unit test * Update unit tests and doc * Need to perform a type check every time * sq * Update unit test * Address comments
1 parent 70bb9e9 commit 82c315f

File tree

3 files changed

+176
-60
lines changed

3 files changed

+176
-60
lines changed

src/Microsoft.Data.Analysis/DataFrame.IO.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ public static DataFrame LoadCsv(Stream csvStream,
271271
}
272272
else
273273
{
274-
ret.Append(spl);
274+
ret.Append(spl, inPlace: true);
275275
}
276276
++rowline;
277277
line = streamReader.ReadLine();

src/Microsoft.Data.Analysis/DataFrame.cs

Lines changed: 68 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -424,56 +424,76 @@ private void ResizeByOneAndAppend(DataFrameColumn column, object value)
424424
}
425425

426426
/// <summary>
427-
/// Appends a row inplace to the DataFrame
427+
/// Appends rows to the DataFrame
428+
/// </summary>
429+
/// <remarks>If an input column's value doesn't match a DataFrameColumn's data type, a conversion will be attempted</remarks>
430+
/// <remarks>If a <see cref="DataFrameRow"/> in <paramref name="rows"/> is null, a null value is appended to each column</remarks>
431+
/// <param name="rows">The rows to be appended to this DataFrame </param>
432+
/// <param name="inPlace">If set, appends <paramref name="rows"/> in place. Otherwise, a new DataFrame is returned with the <paramref name="rows"/> appended</param>
433+
public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false)
434+
{
435+
DataFrame ret = inPlace ? this : Clone();
436+
foreach (DataFrameRow row in rows)
437+
{
438+
ret.Append(row, inPlace: true);
439+
}
440+
return ret;
441+
}
442+
443+
/// <summary>
444+
/// Appends a row to the DataFrame
428445
/// </summary>
429446
/// <remarks>If a column's value doesn't match its column's data type, a conversion will be attempted</remarks>
430447
/// <remarks>If <paramref name="row"/> is null, a null value is appended to each column</remarks>
431448
/// <param name="row"></param>
432-
public void Append(IEnumerable<object> row = null)
449+
/// <param name="inPlace">If set, appends a <paramref name="row"/> in place. Otherwise, a new DataFrame is returned with an appended <paramref name="row"/> </param>
450+
public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false)
433451
{
434-
IEnumerator<DataFrameColumn> columnEnumerator = Columns.GetEnumerator();
452+
DataFrame ret = inPlace ? this : Clone();
453+
IEnumerator<DataFrameColumn> columnEnumerator = ret.Columns.GetEnumerator();
435454
bool columnMoveNext = columnEnumerator.MoveNext();
436455
if (row != null)
437456
{
438-
// Go through row first to make sure there are no data type incompatibilities
439-
IEnumerator<object> rowEnumerator = row.GetEnumerator();
440-
bool rowMoveNext = rowEnumerator.MoveNext();
441-
List<object> cachedObjectConversions = new List<object>();
442-
while (columnMoveNext && rowMoveNext)
443-
{
444-
DataFrameColumn column = columnEnumerator.Current;
445-
object value = rowEnumerator.Current;
446-
// StringDataFrameColumn can accept empty strings. The other columns interpret empty values as nulls
447-
if (value is string stringValue && string.IsNullOrEmpty(stringValue) && column.DataType != typeof(string))
448-
{
449-
value = null;
450-
}
451-
if (value != null)
457+
// Go through row first to make sure there are no data type incompatibilities
458+
IEnumerator<object> rowEnumerator = row.GetEnumerator();
459+
bool rowMoveNext = rowEnumerator.MoveNext();
460+
List<object> cachedObjectConversions = new List<object>();
461+
while (columnMoveNext && rowMoveNext)
452462
{
453-
value = Convert.ChangeType(value, column.DataType);
454-
if (value is null)
463+
DataFrameColumn column = columnEnumerator.Current;
464+
object value = rowEnumerator.Current;
465+
// StringDataFrameColumn can accept empty strings. The other columns interpret empty values as nulls
466+
if (value is string stringValue && string.IsNullOrEmpty(stringValue) && column.DataType != typeof(string))
455467
{
456-
throw new ArgumentException(string.Format(Strings.MismatchedValueType, column.DataType), value.GetType().ToString());
468+
value = null;
457469
}
470+
if (value != null)
471+
{
472+
value = Convert.ChangeType(value, column.DataType);
473+
if (value is null)
474+
{
475+
throw new ArgumentException(string.Format(Strings.MismatchedValueType, column.DataType), value.GetType().ToString());
476+
}
477+
}
478+
cachedObjectConversions.Add(value);
479+
columnMoveNext = columnEnumerator.MoveNext();
480+
rowMoveNext = rowEnumerator.MoveNext();
458481
}
459-
cachedObjectConversions.Add(value);
460-
columnMoveNext = columnEnumerator.MoveNext();
461-
rowMoveNext = rowEnumerator.MoveNext();
462-
}
463-
if (rowMoveNext)
464-
{
465-
throw new ArgumentException(string.Format(Strings.ExceedsNumberOfColumns, Columns.Count), nameof(row));
466-
}
467-
columnEnumerator.Reset();
482+
if (rowMoveNext)
483+
{
484+
throw new ArgumentException(string.Format(Strings.ExceedsNumberOfColumns, Columns.Count), nameof(row));
485+
}
486+
// Reset the enumerators
487+
columnEnumerator = ret.Columns.GetEnumerator();
468488
columnMoveNext = columnEnumerator.MoveNext();
469-
rowEnumerator.Reset();
489+
rowEnumerator = row.GetEnumerator();
470490
rowMoveNext = rowEnumerator.MoveNext();
471491
int cacheIndex = 0;
472492
while (columnMoveNext && rowMoveNext)
473493
{
474494
DataFrameColumn column = columnEnumerator.Current;
475495
object value = cachedObjectConversions[cacheIndex];
476-
ResizeByOneAndAppend(column, value);
496+
ret.ResizeByOneAndAppend(column, value);
477497
columnMoveNext = columnEnumerator.MoveNext();
478498
rowMoveNext = rowEnumerator.MoveNext();
479499
cacheIndex++;
@@ -483,19 +503,22 @@ public void Append(IEnumerable<object> row = null)
483503
{
484504
// Fill the remaining columns with null
485505
DataFrameColumn column = columnEnumerator.Current;
486-
ResizeByOneAndAppend(column, null);
506+
ret.ResizeByOneAndAppend(column, null);
487507
columnMoveNext = columnEnumerator.MoveNext();
488508
}
489-
Columns.RowCount++;
509+
ret.Columns.RowCount++;
510+
return ret;
490511
}
491512

492513
/// <summary>
493-
/// Appends a row inplace by enumerating column names and values from <paramref name="row"/>
514+
/// Appends a row by enumerating column names and values from <paramref name="row"/>
494515
/// </summary>
495516
/// <remarks>If a column's value doesn't match its column's data type, a conversion will be attempted</remarks>
496-
/// <param name="row"></param>
497-
public void Append(IEnumerable<KeyValuePair<string, object>> row)
517+
/// <param name="row">An enumeration of column name and value to be appended</param>
518+
/// <param name="inPlace">If set, appends <paramref name="row"/> in place. Otherwise, a new DataFrame is returned with an appended <paramref name="row"/> </param>
519+
public DataFrame Append(IEnumerable<KeyValuePair<string, object>> row, bool inPlace = false)
498520
{
521+
DataFrame ret = inPlace ? this : Clone();
499522
if (row == null)
500523
{
501524
throw new ArgumentNullException(nameof(row));
@@ -505,13 +528,13 @@ public void Append(IEnumerable<KeyValuePair<string, object>> row)
505528
foreach (KeyValuePair<string, object> columnAndValue in row)
506529
{
507530
string columnName = columnAndValue.Key;
508-
int index = Columns.IndexOf(columnName);
531+
int index = ret.Columns.IndexOf(columnName);
509532
if (index == -1)
510533
{
511534
throw new ArgumentException(Strings.InvalidColumnName, nameof(columnName));
512535
}
513536

514-
DataFrameColumn column = Columns[index];
537+
DataFrameColumn column = ret.Columns[index];
515538
object value = columnAndValue.Value;
516539
if (value != null)
517540
{
@@ -528,22 +551,23 @@ public void Append(IEnumerable<KeyValuePair<string, object>> row)
528551
foreach (KeyValuePair<string, object> columnAndValue in row)
529552
{
530553
string columnName = columnAndValue.Key;
531-
int index = Columns.IndexOf(columnName);
554+
int index = ret.Columns.IndexOf(columnName);
532555

533-
DataFrameColumn column = Columns[index];
556+
DataFrameColumn column = ret.Columns[index];
534557
object value = cachedObjectConversions[cacheIndex];
535-
ResizeByOneAndAppend(column, value);
558+
ret.ResizeByOneAndAppend(column, value);
536559
cacheIndex++;
537560
}
538561

539-
foreach (DataFrameColumn column in Columns)
562+
foreach (DataFrameColumn column in ret.Columns)
540563
{
541564
if (column.Length == Rows.Count)
542565
{
543-
ResizeByOneAndAppend(column, null);
566+
ret.ResizeByOneAndAppend(column, null);
544567
}
545568
}
546-
Columns.RowCount++;
569+
ret.Columns.RowCount++;
570+
return ret;
547571
}
548572

549573
/// <summary>

tests/Microsoft.Data.Analysis.Tests/DataFrameTests.cs

Lines changed: 107 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1947,84 +1947,176 @@ public void TestMutationOnRows()
19471947
}
19481948
}
19491949

1950+
[Fact]
1951+
public void TestAppendRows()
1952+
{
1953+
DataFrame df = MakeDataFrame<float, bool>(10);
1954+
DataFrame df2 = MakeDataFrame<int, bool>(5);
1955+
Assert.Equal(10, df.Rows.Count);
1956+
Assert.Equal(1, df.Columns[0].NullCount);
1957+
Assert.Equal(1, df.Columns[1].NullCount);
1958+
1959+
DataFrame ret = df.Append(df2.Rows, inPlace: false);
1960+
Assert.Equal(10, df.Rows.Count);
1961+
Assert.Equal(1, df.Columns[0].NullCount);
1962+
Assert.Equal(1, df.Columns[1].NullCount);
1963+
1964+
Verify(ret, df, df2);
1965+
1966+
void Verify(DataFrame ret, DataFrame check1, DataFrame check2)
1967+
{
1968+
Assert.Equal(15, ret.Rows.Count);
1969+
Assert.Equal(2, ret.Columns[0].NullCount);
1970+
Assert.Equal(2, ret.Columns[1].NullCount);
1971+
for (long i = 0; i < ret.Rows.Count; i++)
1972+
{
1973+
DataFrameRow row = ret.Rows[i];
1974+
for (int j = 0; j < check1.Columns.Count; j++)
1975+
{
1976+
if (i < check1.Rows.Count)
1977+
{
1978+
Assert.Equal(row[j], check1.Rows[i][j]);
1979+
}
1980+
else
1981+
{
1982+
Assert.Equal(row[j]?.ToString(), (check2.Rows[i - check1.Rows.Count][j])?.ToString());
1983+
}
1984+
}
1985+
}
1986+
}
1987+
1988+
DataFrame dfClone = df.Clone();
1989+
df.Append(df2.Rows, inPlace: true);
1990+
Verify(df, dfClone, df2);
1991+
}
1992+
19501993
[Fact]
19511994
public void TestAppendRow()
19521995
{
19531996
DataFrame df = MakeDataFrame<int, bool>(10);
1954-
df.Append(new List<object> { 5, true });
1997+
df.Append(new List<object> { 5, true }, inPlace: true);
19551998
Assert.Equal(11, df.Rows.Count);
19561999
Assert.Equal(1, df.Columns[0].NullCount);
19572000
Assert.Equal(1, df.Columns[1].NullCount);
19582001

1959-
df.Append(new List<object> { 100 });
2002+
DataFrame ret = df.Append(new List<object> { 5, true });
2003+
Assert.Equal(12, ret.Rows.Count);
2004+
Assert.Equal(1, ret.Columns[0].NullCount);
2005+
Assert.Equal(1, ret.Columns[1].NullCount);
2006+
2007+
df.Append(new List<object> { 100 }, inPlace: true);
19602008
Assert.Equal(12, df.Rows.Count);
19612009
Assert.Equal(1, df.Columns[0].NullCount);
19622010
Assert.Equal(2, df.Columns[1].NullCount);
19632011

1964-
df.Append(new List<object> { null, null });
2012+
ret = df.Append(new List<object> { 100 }, inPlace: false);
2013+
Assert.Equal(13, ret.Rows.Count);
2014+
Assert.Equal(1, ret.Columns[0].NullCount);
2015+
Assert.Equal(3, ret.Columns[1].NullCount);
2016+
2017+
df.Append(new List<object> { null, null }, inPlace: true);
19652018
Assert.Equal(13, df.Rows.Count);
19662019
Assert.Equal(2, df.Columns[0].NullCount);
19672020
Assert.Equal(3, df.Columns[1].NullCount);
2021+
ret = df.Append(new List<object> { null, null }, inPlace: false);
2022+
Assert.Equal(14, ret.Rows.Count);
2023+
Assert.Equal(3, ret.Columns[0].NullCount);
2024+
Assert.Equal(4, ret.Columns[1].NullCount);
19682025

1969-
df.Append(new Dictionary<string, object> { { "Column1", (object)5 } , { "Column2", false } });
2026+
df.Append(new Dictionary<string, object> { { "Column1", (object)5 }, { "Column2", false } }, inPlace: true);
19702027
Assert.Equal(14, df.Rows.Count);
19712028
Assert.Equal(2, df.Columns[0].NullCount);
19722029
Assert.Equal(3, df.Columns[1].NullCount);
2030+
ret = df.Append(new Dictionary<string, object> { { "Column1", (object)5 }, { "Column2", false } }, inPlace: false);
2031+
Assert.Equal(15, ret.Rows.Count);
2032+
Assert.Equal(2, ret.Columns[0].NullCount);
2033+
Assert.Equal(3, ret.Columns[1].NullCount);
19732034

1974-
df.Append(new Dictionary<string, object> { { "Column1", 5 } });
2035+
df.Append(new Dictionary<string, object> { { "Column1", 5 } }, inPlace: true);
19752036
Assert.Equal(15, df.Rows.Count);
19762037

19772038
Assert.Equal(15, df["Column1"].Length);
19782039
Assert.Equal(15, df["Column2"].Length);
19792040
Assert.Equal(2, df.Columns[0].NullCount);
19802041
Assert.Equal(4, df.Columns[1].NullCount);
2042+
ret = df.Append(new Dictionary<string, object> { { "Column1", 5 } }, inPlace: false);
2043+
Assert.Equal(16, ret.Rows.Count);
19812044

1982-
df.Append(new Dictionary<string, object> { { "Column2", false } });
2045+
Assert.Equal(16, ret["Column1"].Length);
2046+
Assert.Equal(16, ret["Column2"].Length);
2047+
Assert.Equal(2, ret.Columns[0].NullCount);
2048+
Assert.Equal(5, ret.Columns[1].NullCount);
2049+
2050+
df.Append(new Dictionary<string, object> { { "Column2", false } }, inPlace: true);
19832051
Assert.Equal(16, df.Rows.Count);
19842052
Assert.Equal(16, df["Column1"].Length);
19852053
Assert.Equal(16, df["Column2"].Length);
19862054
Assert.Equal(3, df.Columns[0].NullCount);
19872055
Assert.Equal(4, df.Columns[1].NullCount);
1988-
1989-
df.Append((IEnumerable<object>)null);
2056+
ret = df.Append(new Dictionary<string, object> { { "Column2", false } }, inPlace: false);
2057+
Assert.Equal(17, ret.Rows.Count);
2058+
Assert.Equal(17, ret["Column1"].Length);
2059+
Assert.Equal(17, ret["Column2"].Length);
2060+
Assert.Equal(4, ret.Columns[0].NullCount);
2061+
Assert.Equal(4, ret.Columns[1].NullCount);
2062+
2063+
df.Append((IEnumerable<object>)null, inPlace: true);
19902064
Assert.Equal(17, df.Rows.Count);
19912065
Assert.Equal(17, df["Column1"].Length);
19922066
Assert.Equal(17, df["Column2"].Length);
19932067
Assert.Equal(4, df.Columns[0].NullCount);
19942068
Assert.Equal(5, df.Columns[1].NullCount);
2069+
ret = df.Append((IEnumerable<object>)null, inPlace: false);
2070+
Assert.Equal(18, ret.Rows.Count);
2071+
Assert.Equal(18, ret["Column1"].Length);
2072+
Assert.Equal(18, ret["Column2"].Length);
2073+
Assert.Equal(5, ret.Columns[0].NullCount);
2074+
Assert.Equal(6, ret.Columns[1].NullCount);
19952075

19962076
// DataFrame must remain usable even if Append throws
1997-
Assert.Throws<FormatException>(() => df.Append(new List<object> { 5, "str" }));
1998-
Assert.Throws<FormatException>(() => df.Append(new Dictionary<string, object> { { "Column2", "str" } }));
1999-
Assert.Throws<ArgumentException>(() => df.Append(new List<object> { 5, true, true }));
2077+
Assert.Throws<FormatException>(() => df.Append(new List<object> { 5, "str" }, inPlace: true));
2078+
Assert.Throws<FormatException>(() => df.Append(new Dictionary<string, object> { { "Column2", "str" } }, inPlace: true));
2079+
Assert.Throws<ArgumentException>(() => df.Append(new List<object> { 5, true, true }, inPlace: true));
2080+
2081+
df.Append(inPlace: true);
2082+
Assert.Equal(18, df.Rows.Count);
2083+
Assert.Equal(18, df["Column1"].Length);
2084+
Assert.Equal(18, df["Column2"].Length);
2085+
Assert.Equal(5, df.Columns[0].NullCount);
2086+
Assert.Equal(6, df.Columns[1].NullCount);
20002087

2001-
df.Append();
2088+
ret = df.Append(inPlace: false);
20022089
Assert.Equal(18, df.Rows.Count);
20032090
Assert.Equal(18, df["Column1"].Length);
20042091
Assert.Equal(18, df["Column2"].Length);
20052092
Assert.Equal(5, df.Columns[0].NullCount);
20062093
Assert.Equal(6, df.Columns[1].NullCount);
2094+
Assert.Equal(19, ret.Rows.Count);
2095+
Assert.Equal(19, ret["Column1"].Length);
2096+
Assert.Equal(19, ret["Column2"].Length);
2097+
Assert.Equal(6, ret.Columns[0].NullCount);
2098+
Assert.Equal(7, ret.Columns[1].NullCount);
20072099
}
20082100

20092101
[Fact]
20102102
public void TestAppendEmptyValue()
20112103
{
20122104
DataFrame df = MakeDataFrame<int, bool>(10);
2013-
df.Append(new List<object> { "", true });
2105+
df.Append(new List<object> { "", true }, inPlace: true);
20142106
Assert.Equal(11, df.Rows.Count);
20152107
Assert.Equal(2, df.Columns[0].NullCount);
20162108
Assert.Equal(1, df.Columns[1].NullCount);
20172109

20182110
StringDataFrameColumn column = new StringDataFrameColumn("Strings", Enumerable.Range(0, 11).Select(x => x.ToString()));
20192111
df.Columns.Add(column);
20202112

2021-
df.Append(new List<object> { 1, true, "" });
2113+
df.Append(new List<object> { 1, true, "" }, inPlace: true);
20222114
Assert.Equal(12, df.Rows.Count);
20232115
Assert.Equal(2, df.Columns[0].NullCount);
20242116
Assert.Equal(1, df.Columns[1].NullCount);
20252117
Assert.Equal(0, df.Columns[2].NullCount);
20262118

2027-
df.Append(new List<object> { 1, true, null });
2119+
df.Append(new List<object> { 1, true, null }, inPlace: true);
20282120
Assert.Equal(13, df.Rows.Count);
20292121
Assert.Equal(1, df.Columns[2].NullCount);
20302122
}

0 commit comments

Comments
 (0)