5
5
using System ;
6
6
using System . Collections . Generic ;
7
7
using System . IO ;
8
- using System . Text ;
9
8
10
9
namespace Microsoft . Data . Analysis
11
10
{
@@ -104,6 +103,77 @@ public static DataFrame LoadCsv(string filename,
104
103
}
105
104
}
106
105
106
/// <summary>
/// Creates a new column whose element type is <paramref name="kind"/>.
/// </summary>
/// <param name="kind">Element type of the column to create.</param>
/// <param name="columnNames">
/// Optional column names. When this is null, or has no entry at
/// <paramref name="columnIndex"/>, the name "Column" followed by the index is used.
/// </param>
/// <param name="columnIndex">Zero-based position of the column; selects (or forms) its name.</param>
/// <returns>
/// A <see cref="StringDataFrameColumn"/> of length 0 when <paramref name="kind"/> is string,
/// otherwise a <see cref="PrimitiveDataFrameColumn{T}"/> of the matching element type.
/// </returns>
/// <exception cref="NotSupportedException"><paramref name="kind"/> is not one of the handled types.</exception>
private static DataFrameColumn CreateColumn(Type kind, string[] columnNames, int columnIndex)
{
    // Resolve the name once. Rows may carry more fields than the supplied names,
    // so fall back to the default name instead of indexing past the end of the array.
    string columnName = columnNames != null && columnIndex < columnNames.Length
        ? columnNames[columnIndex]
        : "Column" + columnIndex.ToString();

    PrimitiveDataFrameColumn<T> CreatePrimitiveDataFrameColumn<T>()
        where T : unmanaged
    {
        return new PrimitiveDataFrameColumn<T>(columnName);
    }

    DataFrameColumn ret;
    if (kind == typeof(bool))
    {
        ret = CreatePrimitiveDataFrameColumn<bool>();
    }
    else if (kind == typeof(int))
    {
        ret = CreatePrimitiveDataFrameColumn<int>();
    }
    else if (kind == typeof(float))
    {
        ret = CreatePrimitiveDataFrameColumn<float>();
    }
    else if (kind == typeof(string))
    {
        // Strings are not unmanaged, so they use the dedicated string column type.
        ret = new StringDataFrameColumn(columnName, 0);
    }
    else if (kind == typeof(long))
    {
        ret = CreatePrimitiveDataFrameColumn<long>();
    }
    else if (kind == typeof(decimal))
    {
        ret = CreatePrimitiveDataFrameColumn<decimal>();
    }
    else if (kind == typeof(byte))
    {
        ret = CreatePrimitiveDataFrameColumn<byte>();
    }
    else if (kind == typeof(char))
    {
        ret = CreatePrimitiveDataFrameColumn<char>();
    }
    else if (kind == typeof(double))
    {
        ret = CreatePrimitiveDataFrameColumn<double>();
    }
    else if (kind == typeof(sbyte))
    {
        ret = CreatePrimitiveDataFrameColumn<sbyte>();
    }
    else if (kind == typeof(short))
    {
        ret = CreatePrimitiveDataFrameColumn<short>();
    }
    else if (kind == typeof(uint))
    {
        ret = CreatePrimitiveDataFrameColumn<uint>();
    }
    else if (kind == typeof(ulong))
    {
        ret = CreatePrimitiveDataFrameColumn<ulong>();
    }
    else if (kind == typeof(ushort))
    {
        ret = CreatePrimitiveDataFrameColumn<ushort>();
    }
    else
    {
        // Also reached when kind is null.
        throw new NotSupportedException(nameof(kind));
    }

    return ret;
}
176
+
107
177
/// <summary>
108
178
/// Reads a seekable stream of CSV data into a DataFrame.
109
179
/// Follows pandas API.
@@ -116,7 +186,7 @@ public static DataFrame LoadCsv(string filename,
116
186
/// <param name="numberOfRowsToRead">number of rows to read not including the header(if present)</param>
117
187
/// <param name="guessRows">number of rows used to guess types</param>
118
188
/// <param name="addIndexColumn">add one column with the row index</param>
119
- /// <returns>DataFrame</returns>
189
+ /// <returns><see cref="DataFrame"/></returns>
120
190
public static DataFrame LoadCsv ( Stream csvStream ,
121
191
char separator = ',' , bool header = true ,
122
192
string [ ] columnNames = null , Type [ ] dataTypes = null ,
@@ -127,7 +197,7 @@ public static DataFrame LoadCsv(Stream csvStream,
127
197
128
198
var linesForGuessType = new List < string [ ] > ( ) ;
129
199
long rowline = 0 ;
130
- int numberOfColumns = 0 ;
200
+ int numberOfColumns = dataTypes ? . Length ?? 0 ;
131
201
132
202
if ( header == true && numberOfRowsToRead != - 1 )
133
203
numberOfRowsToRead ++ ;
@@ -137,60 +207,52 @@ public static DataFrame LoadCsv(Stream csvStream,
137
207
// First pass: schema and number of rows.
138
208
using ( var streamReader = new StreamReader ( csvStream , encoding : null , detectEncodingFromByteOrderMarks : true , bufferSize : - 1 , leaveOpen : true ) )
139
209
{
140
- string line = streamReader . ReadLine ( ) ;
141
- while ( line ! = null )
210
+ string line = null ;
211
+ if ( dataTypes = = null )
142
212
{
143
- if ( ( numberOfRowsToRead == - 1 ) || rowline < numberOfRowsToRead )
213
+ line = streamReader . ReadLine ( ) ;
214
+ while ( line != null )
144
215
{
145
- if ( linesForGuessType . Count < guessRows )
216
+ if ( ( numberOfRowsToRead == - 1 ) || rowline < numberOfRowsToRead )
146
217
{
147
- var spl = line . Split ( separator ) ;
148
- if ( header && rowline == 0 )
218
+ if ( linesForGuessType . Count < guessRows )
149
219
{
150
- if ( columnNames == null )
151
- columnNames = spl ;
152
- }
153
- else
154
- {
155
- linesForGuessType . Add ( spl ) ;
156
- numberOfColumns = Math . Max ( numberOfColumns , spl . Length ) ;
220
+ var spl = line . Split ( separator ) ;
221
+ if ( header && rowline == 0 )
222
+ {
223
+ if ( columnNames == null )
224
+ columnNames = spl ;
225
+ }
226
+ else
227
+ {
228
+ linesForGuessType . Add ( spl ) ;
229
+ numberOfColumns = Math . Max ( numberOfColumns , spl . Length ) ;
230
+ }
157
231
}
158
232
}
233
+ ++ rowline ;
234
+ if ( rowline == guessRows )
235
+ {
236
+ break ;
237
+ }
238
+ line = streamReader . ReadLine ( ) ;
159
239
}
160
- ++ rowline ;
161
- if ( rowline == numberOfRowsToRead )
162
- break ;
163
- line = streamReader . ReadLine ( ) ;
164
- }
165
240
166
- if ( linesForGuessType . Count == 0 )
167
- throw new FormatException ( Strings . EmptyFile ) ;
241
+ if ( linesForGuessType . Count == 0 )
242
+ {
243
+ throw new FormatException ( Strings . EmptyFile ) ;
244
+ }
245
+ }
168
246
169
247
columns = new List < DataFrameColumn > ( numberOfColumns ) ;
170
-
171
- // Guesses types and adds columns.
248
+ // Guesses types or looks up dataTypes and adds columns.
172
249
for ( int i = 0 ; i < numberOfColumns ; ++ i )
173
250
{
174
- Type kind = GuessKind ( i , linesForGuessType ) ;
175
- if ( kind == typeof ( bool ) )
176
- {
177
- DataFrameColumn boolColumn = new PrimitiveDataFrameColumn < bool > ( columnNames == null ? "Column" + i . ToString ( ) : columnNames [ i ] , header == true ? rowline - 1 : rowline ) ;
178
- columns . Add ( boolColumn ) ;
179
- }
180
- else if ( kind == typeof ( float ) )
181
- {
182
- DataFrameColumn floatColumn = new PrimitiveDataFrameColumn < float > ( columnNames == null ? "Column" + i . ToString ( ) : columnNames [ i ] , header == true ? rowline - 1 : rowline ) ;
183
- columns . Add ( floatColumn ) ;
184
- }
185
- else if ( kind == typeof ( string ) )
186
- {
187
- DataFrameColumn stringColumn = new StringDataFrameColumn ( columnNames == null ? "Column" + i . ToString ( ) : columnNames [ i ] , header == true ? rowline - 1 : rowline ) ;
188
- columns . Add ( stringColumn ) ;
189
- }
190
- else
191
- throw new NotSupportedException ( nameof ( kind ) ) ;
251
+ Type kind = dataTypes == null ? GuessKind ( i , linesForGuessType ) : dataTypes [ i ] ;
252
+ columns . Add ( CreateColumn ( kind , columnNames , i ) ) ;
192
253
}
193
254
255
+ DataFrame ret = new DataFrame ( columns ) ;
194
256
line = null ;
195
257
streamReader . DiscardBufferedData ( ) ;
196
258
streamReader . BaseStream . Seek ( streamStart , SeekOrigin . Begin ) ;
@@ -207,7 +269,7 @@ public static DataFrame LoadCsv(Stream csvStream,
207
269
}
208
270
else
209
271
{
210
- AppendRow ( columns , header == true ? rowline - 1 : rowline , spl ) ;
272
+ ret . Append ( spl ) ;
211
273
}
212
274
++ rowline ;
213
275
line = streamReader . ReadLine ( ) ;
@@ -222,61 +284,7 @@ public static DataFrame LoadCsv(Stream csvStream,
222
284
}
223
285
columns . Insert ( 0 , indexColumn ) ;
224
286
}
225
- }
226
- return new DataFrame ( columns ) ;
227
- }
228
-
229
- private static void AppendRow ( List < DataFrameColumn > columns , long rowIndex , string [ ] values )
230
- {
231
- for ( int i = 0 ; i < columns . Count ; i ++ )
232
- {
233
- DataFrameColumn column = columns [ i ] ;
234
- string val = values [ i ] ;
235
- Type dType = column . DataType ;
236
- if ( dType == typeof ( bool ) )
237
- {
238
- bool boolParse = bool . TryParse ( val , out bool boolResult ) ;
239
- if ( boolParse )
240
- {
241
- column [ rowIndex ] = boolResult ;
242
- continue ;
243
- }
244
- else
245
- {
246
- if ( string . IsNullOrEmpty ( val ) )
247
- {
248
- column [ rowIndex ] = null ;
249
- continue ;
250
- }
251
- throw new ArgumentException ( string . Format ( Strings . MismatchedValueType , typeof ( bool ) ) , nameof ( val ) ) ;
252
- }
253
- }
254
- else if ( dType == typeof ( float ) )
255
- {
256
- bool floatParse = float . TryParse ( val , out float floatResult ) ;
257
- if ( floatParse )
258
- {
259
- column [ rowIndex ] = floatResult ;
260
- continue ;
261
- }
262
- else
263
- {
264
- if ( string . IsNullOrEmpty ( val ) )
265
- {
266
- column [ rowIndex ] = null ;
267
- continue ;
268
- }
269
- throw new ArgumentException ( string . Format ( Strings . MismatchedValueType , typeof ( float ) ) , nameof ( val ) ) ;
270
- }
271
- }
272
- else if ( dType == typeof ( string ) )
273
- {
274
- column [ rowIndex ] = values [ i ] ;
275
- }
276
- else
277
- {
278
- throw new NotImplementedException ( ) ;
279
- }
287
+ return ret ;
280
288
}
281
289
}
282
290
}
0 commit comments