From 742a92eafc901d00f07b752571296bff6dbf388d Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Wed, 10 Mar 2021 10:49:23 -0800 Subject: [PATCH 01/10] Part 2 of TextFieldParser. Next up is hooking up ReadCsv to use TextFieldParser --- .../TextFieldParser.cs | 545 +++++++++++++++++- .../TextFieldParserTests.cs | 197 +++++++ 2 files changed, 741 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.Data.Analysis/TextFieldParser.cs b/src/Microsoft.Data.Analysis/TextFieldParser.cs index 6cdada1aff..d3e53ed930 100644 --- a/src/Microsoft.Data.Analysis/TextFieldParser.cs +++ b/src/Microsoft.Data.Analysis/TextFieldParser.cs @@ -6,6 +6,7 @@ using System.Collections.Generic; using System.ComponentModel; using System.Diagnostics; +using System.Globalization; using System.IO; using System.Runtime.CompilerServices; using System.Text; @@ -19,6 +20,102 @@ internal enum FieldType FixedWidth } + internal class QuoteDelimitedFieldBuilder + { + private StringBuilder _field; + private bool _fieldFinished; + private int _index; + private int _delimiterLength; + private Regex _delimiterRegex; + private string _spaceChars; + private bool _malformedLine; + + public QuoteDelimitedFieldBuilder(Regex delimiterRegex, string spaceChars) + { + _delimiterRegex = delimiterRegex; + _spaceChars = spaceChars; + _field = new StringBuilder(); + } + + public bool FieldFinished => _fieldFinished; + + public string Field => _field.ToString(); + + public int Index => _index; + + public int DelimiterLength => _delimiterLength; + + public bool MalformedLine => _malformedLine; + + public void BuildField(string line, int startAt) + { + _index = startAt; + int length = line.Length; + + while (_index < length) + { + if (line[_index] == '"') + { + // Are we at the end of a file? + if (_index + 1 == length) + { + // We've found the end of the field + _fieldFinished = true; + _delimiterLength = 1; + + // Move index past end of line + _index++; + return; + } + // Check to see if this is an escaped quote + if (_index + 1 < line.Length && line[_index + 1] == '"') + { + _field.Append('"'); + _index += 2; + continue; + } + + // Find the next delimiter and make sure everything between the quote and delimiter is ignorable + int Limit; + Match delimiterMatch = _delimiterRegex.Match(line, _index + 1); + if (!delimiterMatch.Success) + { + Limit = length - 1; + } + else + { + Limit = delimiterMatch.Index - 1; + } + + for (int i = _index + 1; i < Limit; i++) + { + if (_spaceChars.IndexOf(line[i]) < 0) + { + _malformedLine = true; + return; + } + } + + // The length of the delimiter is the length of the closing quote (1) + any spaces + the length of the delimiter we matched if any + _delimiterLength = 1 + Limit - _index; + if (delimiterMatch.Success) + { + _delimiterLength += delimiterMatch.Length; + } + + _fieldFinished = true; + return; + } + else + { + _field.Append(line[_index]); + _index += 1; + } + } + } + } + + internal class TextFieldParser : IDisposable { private delegate int ChangeBufferFunction(); @@ -47,7 +144,13 @@ internal class TextFieldParser : IDisposable private string[] _delimitersCopy; - private Regex _whiteSpaceRegEx = new Regex("\\s", RegexOptions.CultureInvariant); + private Regex _delimiterRegex; + + private Regex _delimiterWithEndCharsRegex; + + private int[] _whitespaceCodes = new int[] { '\u0020' }; + + private Regex _beginQuotesRegex; private bool _trimWhiteSpace = true; @@ -57,12 +160,20 @@ internal class TextFieldParser : IDisposable private int _charsRead; + private bool _needPropertyCheck = true; + private const int DEFAULT_BUFFER_LENGTH = 4096; private char[] _buffer = new char[DEFAULT_BUFFER_LENGTH]; private bool _hasFieldsEnclosedInQuotes = true; + private int _lineLength; + + private string _spaceChars; + + private int _maxLineSize = 10000000; + private int _maxBufferSize = 10000000; private bool _leaveOpen; @@ -76,6 +187,7 @@ public string[] CommentTokens { CheckCommentTokensForWhitespace(value); _commentTokens = value; + _needPropertyCheck = true; } } @@ -125,6 +237,7 @@ public FieldType TextFieldType { ValidateFieldTypeEnumValue(value, "value"); _textFieldType = value; + _needPropertyCheck = true; } } @@ -143,6 +256,7 @@ private set _fieldWidthsCopy = null; } _fieldWidths = value; + _needPropertyCheck = true; } } @@ -161,6 +275,8 @@ private set _delimitersCopy = null; } _delimiters = value; + _needPropertyCheck = true; + _beginQuotesRegex = null; } } @@ -182,6 +298,59 @@ public bool HasFieldsEnclosedInQuotes } } + private Regex BeginQuotesRegex + { + get + { + if (_beginQuotesRegex == null) + { + string pattern = string.Format(CultureInfo.InvariantCulture, "\\G[{0}]*\"", WhitespacePattern); + _beginQuotesRegex = new Regex(pattern, RegexOptions.CultureInvariant); + } + return _beginQuotesRegex; + } + } + + private string EndQuotePattern => string.Format(CultureInfo.InvariantCulture, "\"[{0}]*", WhitespacePattern); + + private string WhitespaceCharacters + { + get + { + StringBuilder builder = new StringBuilder(); + int[] whitespaceCodes = _whitespaceCodes; + foreach (int code in whitespaceCodes) + { + char spaceChar = (char)code; + if (!CharacterIsInDelimiter(spaceChar)) + { + builder.Append(spaceChar); + } + } + return builder.ToString(); + } + } + + // What's the difference between this and WhitespaceCharacters in how they are used? + private string WhitespacePattern + { + get + { + StringBuilder builder = new StringBuilder(); + int[] whitespaceCodes = _whitespaceCodes; + for (int i = 0; i < whitespaceCodes.Length; i++) + { + int code = whitespaceCodes[i]; + char spaceChar = (char)code; + if (!CharacterIsInDelimiter(spaceChar)) + { + builder.Append("\\u" + code.ToString("X4", CultureInfo.InvariantCulture)); + } + } + return builder.ToString(); + } + } + public TextFieldParser(string path) { InitializeFromPath(path, Encoding.ASCII, detectEncoding: true); @@ -254,6 +423,25 @@ public string ReadLine() return line.TrimEnd(newLineChars); } + public string[] ReadFields() + { + if ((_reader == null) | (_buffer == null)) + { + return null; + } + ValidateReadyToRead(); + switch (_textFieldType) + { + case FieldType.FixedWidth: + return ParseFixedWidthLine(); + case FieldType.Delimited: + return ParseDelimitedLine(); + default: + Debug.Fail("The TextFieldType is not supported"); + return null; + } + } + /// /// Peek at characters of the next data line without reading the line /// @@ -355,6 +543,8 @@ private void FinishReading() _lineNumber = -1L; _endOfData = true; _buffer = null; + _delimiterRegex = null; + _beginQuotesRegex = null; } private void InitializeFromPath(string path, Encoding defaultEncoding, bool detectEncoding) @@ -487,6 +677,23 @@ private int IncreaseBufferSize() return charsRead; } + private string ReadNextDataLine() + { + ChangeBufferFunction BufferFunction = ReadToBuffer; + string line; + do + { + line = ReadNextLine(ref _position, BufferFunction); + _lineNumber++; + } + while (IgnoreLine(line)); + if (line == null) + { + CloseReader(); + } + return line; + } + private string PeekNextDataLine() { ChangeBufferFunction BufferFunction = IncreaseBufferSize; @@ -562,6 +769,205 @@ private string ReadNextLine(ref int cursor, ChangeBufferFunction changeBuffer) return Builder.ToString(); } + private string[] ParseDelimitedLine() + { + string line = ReadNextDataLine(); + if (line == null) + { + return null; + } + long currentLineNumber = _lineNumber - 1; + int index = 0; + List Fields = new List(); + int lineEndIndex = GetEndOfLineIndex(line); + while (index <= lineEndIndex) + { + Match matchResult = null; + bool quoteDelimited = false; + if (HasFieldsEnclosedInQuotes) + { + matchResult = BeginQuotesRegex.Match(line, index); + quoteDelimited = matchResult.Success; + } + string field; + if (quoteDelimited) + { + // Move the Index beyond quote + index = matchResult.Index + matchResult.Length; + + // Looking for the closing quote + QuoteDelimitedFieldBuilder endHelper = new QuoteDelimitedFieldBuilder(_delimiterWithEndCharsRegex, _spaceChars); + endHelper.BuildField(line, index); + if (endHelper.MalformedLine) + { + _errorLine = line.TrimEnd(newLineChars); + _errorLineNumber = currentLineNumber; + throw new Exception($"Line {currentLineNumber} cannot be parsed with the current Delimiters"); + } + if (endHelper.FieldFinished) + { + field = endHelper.Field; + index = endHelper.Index + endHelper.DelimiterLength; + } + else + { + // We may have an embedded line end character, so grab next line + do + { + int endOfLine = line.Length; + string newLine = ReadNextDataLine(); + if (newLine == null) + { + _errorLine = line.TrimEnd(newLineChars); + _errorLineNumber = currentLineNumber; + throw new Exception($"Line {currentLineNumber} cannot be parsed with the current Delimiters"); + } + if (line.Length + newLine.Length > _maxLineSize) + { + _errorLine = line.TrimEnd(newLineChars); + _errorLineNumber = currentLineNumber; + throw new Exception($"Line {currentLineNumber} cannot be read because it exceeds the max line size"); + } + line += newLine; + lineEndIndex = GetEndOfLineIndex(line); + endHelper.BuildField(line, endOfLine); + if (endHelper.MalformedLine) + { + _errorLine = line.TrimEnd(newLineChars); + _errorLineNumber = currentLineNumber; + throw new Exception($"Line {currentLineNumber} cannot be parsed with the current Delimiters"); + } + } + while (!endHelper.FieldFinished); + field = endHelper.Field; + index = endHelper.Index + endHelper.DelimiterLength; + } + if (_trimWhiteSpace) + { + field = field.Trim(); + } + Fields.Add(field); + continue; + } + + // Find the next delimiter + Match delimiterMatch = _delimiterRegex.Match(line, index); + if (delimiterMatch.Success) + { + field = line.Substring(index, delimiterMatch.Index - index); + if (_trimWhiteSpace) + { + field = field.Trim(); + } + Fields.Add(field); + index = delimiterMatch.Index + delimiterMatch.Length; + continue; + } + field = line.Substring(index).TrimEnd(newLineChars); + if (_trimWhiteSpace) + { + field = field.Trim(); + } + Fields.Add(field); + break; + } + return Fields.ToArray(); + } + + private string[] ParseFixedWidthLine() + { + Debug.Assert(_fieldWidths != null, "No field widths"); + string line = ReadNextDataLine(); + if (line == null) + { + return null; + } + line = line.TrimEnd(newLineChars); + StringInfo lineInfo = new StringInfo(line); + ValidateFixedWidthLine(lineInfo, _lineNumber - 1); + int index = 0; + int bound = _fieldWidths.Length - 1; + string[] Fields = new string[bound + 1]; + for (int i = 0; i <= bound; i++) + { + Fields[i] = GetFixedWidthField(lineInfo, index, _fieldWidths[i]); + index += _fieldWidths[i]; + } + return Fields; + } + + private string GetFixedWidthField(StringInfo line, int index, int fieldLength) + { + string field = ((fieldLength > 0) ? line.SubstringByTextElements(index, fieldLength) : ((index < line.LengthInTextElements) ? line.SubstringByTextElements(index).TrimEnd(newLineChars) : string.Empty)); + if (_trimWhiteSpace) + { + return field.Trim(); + } + return field; + } + + private int GetEndOfLineIndex(string line) + { + Debug.Assert(line != null, "We are parsing null"); + int length = line.Length; + Debug.Assert(length > 0, "A blank line shouldn't be parsed"); + if (length == 1) + { + Debug.Assert(!line[0].Equals('\r') & !line[0].Equals('\n'), "A blank line shouldn't be parsed"); + return length; + } + checked + { + if (line[length - 2].Equals('\r') | line[length - 2].Equals('\n')) + { + return length - 2; + } + if (line[length - 1].Equals('\r') | line[length - 1].Equals('\n')) + { + return length - 1; + } + return length; + } + } + + private void ValidateFixedWidthLine(StringInfo line, long lineNumber) + { + Debug.Assert(line != null, "No Line sent"); + if (line.LengthInTextElements < _lineLength) + { + _errorLine = line.String; + _errorLineNumber = checked(_lineNumber - 1); + throw new Exception($"Line {lineNumber} cannot be parsed with the current FieldWidths"); + } + } + + private void ValidateFieldWidths() + { + if (_fieldWidths == null) + { + throw new InvalidOperationException("m_FieldWidths is null"); + } + if (_fieldWidths.Length == 0) + { + throw new InvalidOperationException("m_FieldWidths is empty"); + } + checked + { + int widthBound = _fieldWidths.Length - 1; + _lineLength = 0; + int num = widthBound - 1; + for (int i = 0; i <= num; i++) + { + Debug.Assert(_fieldWidths[i] > 0, "Bad field width, this should have been caught on input"); + _lineLength += _fieldWidths[i]; + } + if (_fieldWidths[widthBound] > 0) + { + _lineLength += _fieldWidths[widthBound]; + } + } + } + private void ValidateFieldWidthsOnInput(int[] widths) { Debug.Assert(widths != null, "There are no field widths"); @@ -575,6 +981,76 @@ private void ValidateFieldWidthsOnInput(int[] widths) } } + private void ValidateAndEscapeDelimiters() + { + if (_delimiters == null) + { + throw new Exception("m_Delimiters is null"); + } + if (_delimiters.Length == 0) + { + throw new Exception("m_Delimiters is empty"); + } + int length = _delimiters.Length; + StringBuilder builder = new StringBuilder(); + StringBuilder quoteBuilder = new StringBuilder(); + quoteBuilder.Append(EndQuotePattern + "("); + for (int i = 0; i <= length - 1; i++) + { + if (_delimiters[i] != null) + { + if (_hasFieldsEnclosedInQuotes && _delimiters[i].IndexOf('"') > -1) + { + throw new Exception("A double quote is not a legal delimiter when HasFieldsEnclosedInQuotes is set to True."); + } + string EscapedDelimiter = Regex.Escape(_delimiters[i]); + builder.Append(EscapedDelimiter + "|"); + quoteBuilder.Append(EscapedDelimiter + "|"); + } + else + { + Debug.Fail("Delimiter element is empty. This should have been caught on input"); + } + } + _spaceChars = WhitespaceCharacters; + _delimiterRegex = new Regex(builder.ToString(0, builder.Length - 1), (RegexOptions)512); + builder.Append("\r|\n"); + _delimiterWithEndCharsRegex = new Regex(builder.ToString(), (RegexOptions)512); + quoteBuilder.Append("\r|\n)|\"$"); + } + + private void ValidateReadyToRead() + { + if (!(_needPropertyCheck | ArrayHasChanged())) + { + return; + } + switch (_textFieldType) + { + case FieldType.Delimited: + ValidateAndEscapeDelimiters(); + break; + case FieldType.FixedWidth: + ValidateFieldWidths(); + break; + default: + Debug.Fail("Unknown TextFieldType"); + break; + } + if (_commentTokens != null) + { + string[] commentTokens = _commentTokens; + foreach (string Token in commentTokens) + { + if (Token != string.Empty && (_hasFieldsEnclosedInQuotes & (_textFieldType == FieldType.Delimited)) && string.Compare(Token.Trim(), "\"", StringComparison.Ordinal) == 0) + { + throw new Exception("A double quote is not a legal comment token when HasFieldsEnclosedInQuotes is set to True."); + } + } + } + _needPropertyCheck = false; + } + private void ValidateDelimiters(string[] delimiterArray) { if (delimiterArray == null) @@ -594,6 +1070,59 @@ private void ValidateDelimiters(string[] delimiterArray) } } + private bool ArrayHasChanged() + { + int lowerBound = 0; + int upperBound = 0; + switch (_textFieldType) + { + case FieldType.Delimited: + { + Debug.Assert(((_delimitersCopy == null) & (_delimiters == null)) | ((_delimitersCopy != null) & (_delimiters != null)), "Delimiters and copy are not both Nothing or both not Nothing"); + if (_delimiters == null) + { + return false; + } + lowerBound = _delimitersCopy.GetLowerBound(0); + upperBound = _delimitersCopy.GetUpperBound(0); + int num3 = lowerBound; + int num4 = upperBound; + for (int i = num3; i <= num4; i++) + { + if (_delimiters[i] != _delimitersCopy[i]) + { + return true; + } + } + break; + } + case FieldType.FixedWidth: + { + Debug.Assert(((_fieldWidthsCopy == null) & (_fieldWidths == null)) | ((_fieldWidthsCopy != null) & (_fieldWidths != null)), "FieldWidths and copy are not both Nothing or both not Nothing"); + if (_fieldWidths == null) + { + return false; + } + lowerBound = _fieldWidthsCopy.GetLowerBound(0); + upperBound = _fieldWidthsCopy.GetUpperBound(0); + int num = lowerBound; + int num2 = upperBound; + for (int j = num; j <= num2; j++) + { + if (_fieldWidths[j] != _fieldWidthsCopy[j]) + { + return true; + } + } + break; + } + default: + Debug.Fail("Unknown TextFieldType"); + break; + } + return false; + } + private void CheckCommentTokensForWhitespace(string[] tokens) { if (tokens == null) @@ -608,5 +1137,19 @@ private void CheckCommentTokensForWhitespace(string[] tokens) } } } + + private bool CharacterIsInDelimiter(char testCharacter) + { + Debug.Assert(_delimiters != null, "No delimiters set!"); + string[] delimiters = _delimiters; + foreach (string delimiter in delimiters) + { + if (delimiter.IndexOf(testCharacter) > -1) + { + return true; + } + } + return false; + } } } diff --git a/test/Microsoft.Data.Analysis.Tests/TextFieldParserTests.cs b/test/Microsoft.Data.Analysis.Tests/TextFieldParserTests.cs index 96ac2d4787..c7a0d5335f 100644 --- a/test/Microsoft.Data.Analysis.Tests/TextFieldParserTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/TextFieldParserTests.cs @@ -164,5 +164,202 @@ public void ReadLine_ReadToEnd() Assert.True(parser.EndOfData); } } + + [Fact] + public void ErrorLine() + { + string data = @"abc 123 +def 45 +ghi 789"; + + using (var parser = new TextFieldParser(GetStream(data))) + { + parser.TextFieldType = FieldType.FixedWidth; + parser.SetFieldWidths(new[] { 3, 4 }); + + Assert.Equal(-1, parser.ErrorLineNumber); + Assert.Equal("", parser.ErrorLine); + + Assert.Equal(new[] { "abc", "123" }, parser.ReadFields()); + Assert.Equal(-1, parser.ErrorLineNumber); + Assert.Equal("", parser.ErrorLine); + + Assert.Throws(() => parser.ReadFields()); + Assert.Equal(2, parser.ErrorLineNumber); + Assert.Equal("def 45", parser.ErrorLine); + + Assert.Equal(new[] { "ghi", "789" }, parser.ReadFields()); + Assert.Equal(2, parser.ErrorLineNumber); + Assert.Equal("def 45", parser.ErrorLine); + } + } + + [Fact] + public void HasFieldsEnclosedInQuotes_TrimWhiteSpace() + { + string data = @""""", "" "" ,""abc"", "" 123 "" ,"; + + using (var parser = new TextFieldParser(GetStream(data))) + { + parser.SetDelimiters(new[] { "," }); + Assert.Equal(new[] { "", "", "abc", "123", "" }, parser.ReadFields()); + } + + using (var parser = new TextFieldParser(GetStream(data))) + { + parser.TrimWhiteSpace = false; + parser.SetDelimiters(new[] { "," }); + Assert.Equal(new[] { "", " ", "abc", " 123 ", "" }, parser.ReadFields()); + } + + using (var parser = new TextFieldParser(GetStream(data))) + { + parser.HasFieldsEnclosedInQuotes = false; + parser.SetDelimiters(new[] { "," }); + Assert.Equal(new[] { @"""""", @""" """, @"""abc""", @""" 123 """, "" }, parser.ReadFields()); + } + + using (var parser = new TextFieldParser(GetStream(data))) + { + parser.TrimWhiteSpace = false; + parser.HasFieldsEnclosedInQuotes = false; + parser.SetDelimiters(new[] { "," }); + Assert.Equal(new[] { @"""""", @" "" "" ", @"""abc""", @" "" 123 "" ", "" }, parser.ReadFields()); + } + + data = @""","", "", """; + using (var parser = new TextFieldParser(GetStream(data))) + { + parser.TrimWhiteSpace = false; + parser.SetDelimiters(new[] { "," }); + Assert.Equal(new[] { ",", ", " }, parser.ReadFields()); + } + } + + [Fact] + public void PeekChars() + { + string data = @"abc,123 +def,456 +ghi,789"; + + using (var parser = new TextFieldParser(GetStream(data))) + { + Assert.Throws(() => parser.PeekChars(0)); + + Assert.Equal("a", parser.PeekChars(1)); + Assert.Equal("abc,123", parser.PeekChars(10)); + + Assert.Equal("abc,123", parser.ReadLine()); + + parser.TextFieldType = FieldType.FixedWidth; + parser.SetFieldWidths(new[] { 3, -1 }); + + Assert.Equal("d", parser.PeekChars(1)); + Assert.Equal("def,456", parser.PeekChars(10)); + Assert.Equal(new[] { "def", ",456" }, parser.ReadFields()); + + parser.TextFieldType = FieldType.Delimited; + parser.SetDelimiters(new[] { "," }); + + Assert.Equal("g", parser.PeekChars(1)); + Assert.Equal("ghi,789", parser.PeekChars(10)); + Assert.Equal(new[] { "ghi", "789" }, parser.ReadFields()); + + Assert.Null(parser.PeekChars(1)); + Assert.Null(parser.PeekChars(10)); + } + } + + [Fact] + public void ReadFields_FieldWidths() + { + string data = @"abc,123 +def,456 +ghi,789"; + + using (var parser = new TextFieldParser(GetStream(data))) + { + parser.TextFieldType = FieldType.FixedWidth; + + Assert.Throws(() => parser.ReadFields()); + + parser.SetFieldWidths(new[] { -1 }); + Assert.Equal(new[] { "abc,123" }, parser.ReadFields()); + + parser.SetFieldWidths(new[] { 3, -1 }); + Assert.Equal(new[] { "def", ",456" }, parser.ReadFields()); + + parser.SetFieldWidths(new[] { 3, 2 }); + Assert.Equal(new[] { "ghi", ",7" }, parser.ReadFields()); + + parser.SetFieldWidths(new[] { 3, 2 }); + Assert.Null(parser.ReadFields()); + } + } + + [Fact] + public void ReadFields_Delimiters_LineNumber() + { + string data = @"abc,123 +def,456 +ghi,789"; + + using (var parser = new TextFieldParser(GetStream(data))) + { + Assert.Equal(1, parser.LineNumber); + + Assert.Throws(() => parser.ReadFields()); + Assert.Equal(1, parser.LineNumber); + + parser.SetDelimiters(new[] { "," }); + Assert.Equal(new[] { "abc","123" }, parser.ReadFields()); + Assert.Equal(2, parser.LineNumber); + + parser.SetDelimiters(new[] { ";", "," }); + Assert.Equal(new[] { "def", "456" }, parser.ReadFields()); + Assert.Equal(3, parser.LineNumber); + + parser.SetDelimiters(new[] { "g", "9" }); + Assert.Equal(new[] { "", "hi,78", "" }, parser.ReadFields()); + Assert.Equal(-1, parser.LineNumber); + } + + data = @",, + +, +"; + + using (var parser = new TextFieldParser(GetStream(data))) + { + Assert.Equal(1, parser.LineNumber); + + parser.SetDelimiters(new[] { "," }); + Assert.Equal(new[] { "", "", "" }, parser.ReadFields()); + Assert.Equal(2, parser.LineNumber); + + // ReadFields should ignore the empty new line + Assert.Equal(new[] { "", "" }, parser.ReadFields()); + Assert.Equal(-1, parser.LineNumber); + + Assert.Null(parser.ReadFields()); + Assert.Equal(-1, parser.LineNumber); + + Assert.Null(parser.ReadFields()); + Assert.Equal(-1, parser.LineNumber); + } + } + + [Fact] + public void UnmatchedQuote_MalformedLineException() + { + string data = @""""", """; + + using (var parser = new TextFieldParser(GetStream(data))) + { + parser.SetDelimiters(new[] { "," }); + Assert.Throws(() => parser.ReadFields()); + } + } } } From 47b434bb36e1c3899782758461eaa365f3b5aa9b Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Thu, 11 Mar 2021 15:51:42 -0800 Subject: [PATCH 02/10] Make LoadCsv use TextFieldParser --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 107 +++++++++--------- .../DataFrame.IOTests.cs | 42 ++++--- 2 files changed, 78 insertions(+), 71 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index eff4c3a90b..f09595dd8d 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -172,17 +172,21 @@ private static DataFrameColumn CreateColumn(Type kind, string[] columnNames, int return ret; } - private static DataFrame ReadCsvLinesIntoDataFrame(IEnumerable lines, + private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringReader wrappedReader, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, - long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false - ) + long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false, + Encoding encoding = null) { if (dataTypes == null && guessRows <= 0) { throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes))); } + TextReader textReader = wrappedReader.TextReader; + TextFieldParser parser = new TextFieldParser(textReader); + parser.SetDelimiters(separator.ToString()); + var linesForGuessType = new List(); long rowline = 0; int numberOfColumns = dataTypes?.Length ?? 0; @@ -194,17 +198,15 @@ private static DataFrame ReadCsvLinesIntoDataFrame(IEnumerable lines, List columns; // First pass: schema and number of rows. - string line = null; - var enumerator = lines.GetEnumerator(); - while (enumerator.MoveNext()) + string[] fields; + while ((fields = parser.ReadFields()) != null) { - line = enumerator.Current; if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead) { if (linesForGuessType.Count < guessRows || (header && rowline == 0)) { - var spl = line.Split(separator); + string[] spl = fields; if (header && rowline == 0) { if (columnNames == null) @@ -240,15 +242,16 @@ private static DataFrame ReadCsvLinesIntoDataFrame(IEnumerable lines, } DataFrame ret = new DataFrame(columns); - line = null; // Fill values. - enumerator.Reset(); + textReader = wrappedReader.TextReader; + parser = new TextFieldParser(textReader); + parser.SetDelimiters(separator.ToString()); + rowline = 0; - while (enumerator.MoveNext() && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead)) + while ((fields = parser.ReadFields()) != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead)) { - line = enumerator.Current; - var spl = line.Split(separator); + string[] spl = fields; if (header && rowline == 0) { // Skips. @@ -269,54 +272,54 @@ private static DataFrame ReadCsvLinesIntoDataFrame(IEnumerable lines, } columns.Insert(0, indexColumn); } + + textReader.Dispose(); return ret; } - private class CsvLines : IEnumerable + private class WrappedStreamReaderOrStringReader { - private CsvLineEnumerator enumerator; - public CsvLines(CsvLineEnumerator csvLineEnumerator) + private Stream _stream; + private long _initialPosition; + private Encoding _encoding; + private string _csvString; + + public WrappedStreamReaderOrStringReader(Stream stream, Encoding encoding) { - enumerator = csvLineEnumerator; + _stream = stream; + _initialPosition = stream.Position; + _encoding = encoding; + _csvString = null; } - public IEnumerator GetEnumerator() => enumerator; - - IEnumerator IEnumerable.GetEnumerator() => enumerator; - } - - private class CsvLineEnumerator : IEnumerator - { - private StreamReader streamReader; - private string currentLine; - private long streamStartPosition; - public CsvLineEnumerator(StreamReader csvStream) + public WrappedStreamReaderOrStringReader(string csvString) { - streamStartPosition = csvStream.BaseStream.Position; - streamReader = csvStream; - currentLine = null; + _csvString = csvString; + _initialPosition = 0; + _encoding = null; + _stream = null; } - public string Current => currentLine; - - object IEnumerator.Current => currentLine; + public long InitialPosition => _initialPosition; - public void Dispose() + // Returns a new TextReader. If the wrapped object is a stream, the stream is reset to its initial position. + public TextReader TextReader { - throw new NotImplementedException(); - } + get + { + if (_stream != null) + { + _stream.Seek(_initialPosition, SeekOrigin.Begin); + return new StreamReader(_stream, _encoding, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true); + } + else + { + return new StringReader(_csvString); + } + } - public bool MoveNext() - { - currentLine = streamReader.ReadLine(); - return currentLine != null; } - public void Reset() - { - streamReader.DiscardBufferedData(); - streamReader.BaseStream.Seek(streamStartPosition, SeekOrigin.Begin); - } } /// @@ -336,8 +339,8 @@ public static DataFrame LoadCsvFromString(string csvString, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false) { - string[] lines = csvString.Split(new[] { Environment.NewLine }, StringSplitOptions.None); - return ReadCsvLinesIntoDataFrame(lines, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn); + WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvString); + return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn); } /// @@ -369,12 +372,8 @@ public static DataFrame LoadCsv(Stream csvStream, throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes))); } - using (var streamReader = new StreamReader(csvStream, encoding ?? Encoding.UTF8, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true)) - { - CsvLineEnumerator linesEnumerator = new CsvLineEnumerator(streamReader); - IEnumerable lines = new CsvLines(linesEnumerator); - return ReadCsvLinesIntoDataFrame(lines, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn); - } + WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvStream, encoding ?? Encoding.UTF8); + return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn); } /// diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index 9e8ccb6903..4445fd2bd7 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -101,14 +101,18 @@ internal static void VerifyColumnTypes(DataFrame df, bool testArrowStringColumn } } - [Fact] - public void TestReadCsvWithHeader() + [Theory] + [InlineData(false)] + [InlineData(true)] + public void TestReadCsvWithHeader(bool useQuotes) { - string data = @"vendor_id,rate_code,passenger_count,trip_time_in_secs,trip_distance,payment_type,fare_amount -CMT,1,1,1271,3.8,CRD,17.5 -CMT,1,1,474,1.5,CRD,8 -CMT,1,1,637,1.4,CRD,8.5 -CMT,1,1,181,0.6,CSH,4.5"; + string CMT = useQuotes ? @"""C,MT""" : "CMT"; + string verifyCMT = useQuotes ? "C,MT" : "CMT"; + string data = @$"vendor_id,rate_code,passenger_count,trip_time_in_secs,trip_distance,payment_type,fare_amount +{CMT},1,1,1271,3.8,CRD,17.5 +{CMT},1,1,474,1.5,CRD,8 +{CMT},1,1,637,1.4,CRD,8.5 +{CMT},1,1,181,0.6,CSH,4.5"; Stream GetStream(string streamData) { @@ -118,7 +122,7 @@ void RegularTest(DataFrame df) { Assert.Equal(4, df.Rows.Count); Assert.Equal(7, df.Columns.Count); - Assert.Equal("CMT", df.Columns["vendor_id"][3]); + Assert.Equal(verifyCMT, df.Columns["vendor_id"][3]); VerifyColumnTypes(df); } DataFrame df = DataFrame.LoadCsv(GetStream(data)); @@ -130,7 +134,7 @@ void ReducedRowsTest(DataFrame reducedRows) { Assert.Equal(3, reducedRows.Rows.Count); Assert.Equal(7, reducedRows.Columns.Count); - Assert.Equal("CMT", reducedRows.Columns["vendor_id"][2]); + Assert.Equal(verifyCMT, reducedRows.Columns["vendor_id"][2]); VerifyColumnTypes(df); } DataFrame reducedRows = DataFrame.LoadCsv(GetStream(data), numberOfRowsToRead: 3); @@ -139,13 +143,17 @@ void ReducedRowsTest(DataFrame reducedRows) ReducedRowsTest(csvDf); } - [Fact] - public void TestReadCsvNoHeader() + [Theory] + [InlineData(false)] + [InlineData(true)] + public void TestReadCsvNoHeader(bool useQuotes) { - string data = @"CMT,1,1,1271,3.8,CRD,17.5 -CMT,1,1,474,1.5,CRD,8 -CMT,1,1,637,1.4,CRD,8.5 -CMT,1,1,181,0.6,CSH,4.5"; + string CMT = useQuotes ? @"""C,MT""" : "CMT"; + string verifyCMT = useQuotes ? "C,MT" : "CMT"; + string data = @$"{CMT},1,1,1271,3.8,CRD,17.5 +{CMT},1,1,474,1.5,CRD,8 +{CMT},1,1,637,1.4,CRD,8.5 +{CMT},1,1,181,0.6,CSH,4.5"; Stream GetStream(string streamData) { @@ -155,7 +163,7 @@ void RegularTest(DataFrame df) { Assert.Equal(4, df.Rows.Count); Assert.Equal(7, df.Columns.Count); - Assert.Equal("CMT", df.Columns["Column0"][3]); + Assert.Equal(verifyCMT, df.Columns["Column0"][3]); VerifyColumnTypes(df); } @@ -168,7 +176,7 @@ void ReducedRowsTest(DataFrame reducedRows) { Assert.Equal(3, reducedRows.Rows.Count); Assert.Equal(7, reducedRows.Columns.Count); - Assert.Equal("CMT", reducedRows.Columns["Column0"][2]); + Assert.Equal(verifyCMT, reducedRows.Columns["Column0"][2]); VerifyColumnTypes(df); } From 17f6d7f8946357031d7707467b345e6a42b05f6d Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Fri, 12 Mar 2021 11:50:21 -0800 Subject: [PATCH 03/10] More unit tests --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 4 +- .../TextFieldParser.cs | 65 ++- .../strings.Designer.cs | 135 ++++++ src/Microsoft.Data.Analysis/strings.resx | 45 ++ .../DataFrame.IOTests.cs | 43 ++ .../Microsoft.Data.Analysis.Tests.csproj | 20 + .../Strings.Designer.cs | 414 ++++++++++++++++++ .../Strings.resx | 237 ++++++++++ .../TextFieldParserTests.cs | 54 +++ 9 files changed, 982 insertions(+), 35 deletions(-) create mode 100644 test/Microsoft.Data.Analysis.Tests/Strings.Designer.cs create mode 100644 test/Microsoft.Data.Analysis.Tests/Strings.resx diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index f09595dd8d..7951d76405 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -175,8 +175,8 @@ private static DataFrameColumn CreateColumn(Type kind, string[] columnNames, int private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringReader wrappedReader, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, - long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false, - Encoding encoding = null) + long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false + ) { if (dataTypes == null && guessRows <= 0) { diff --git a/src/Microsoft.Data.Analysis/TextFieldParser.cs b/src/Microsoft.Data.Analysis/TextFieldParser.cs index d3e53ed930..7ecff015c4 100644 --- a/src/Microsoft.Data.Analysis/TextFieldParser.cs +++ b/src/Microsoft.Data.Analysis/TextFieldParser.cs @@ -232,7 +232,7 @@ public long LineNumber public FieldType TextFieldType { - get =>_textFieldType; + get => _textFieldType; set { ValidateFieldTypeEnumValue(value, "value"); @@ -243,7 +243,7 @@ public FieldType TextFieldType public int[] FieldWidths { - get =>_fieldWidths; + get => _fieldWidths; private set { if (value != null) @@ -282,7 +282,7 @@ private set public bool TrimWhiteSpace { - get =>_trimWhiteSpace; + get => _trimWhiteSpace; set { _trimWhiteSpace = value; @@ -291,7 +291,7 @@ public bool TrimWhiteSpace public bool HasFieldsEnclosedInQuotes { - get =>_hasFieldsEnclosedInQuotes; + get => _hasFieldsEnclosedInQuotes; set { _hasFieldsEnclosedInQuotes = value; @@ -331,7 +331,6 @@ private string WhitespaceCharacters } } - // What's the difference between this and WhitespaceCharacters in how they are used? private string WhitespacePattern { get @@ -451,7 +450,7 @@ public string PeekChars(int numberOfChars) { if (numberOfChars <= 0) { - throw new ArgumentException($"{nameof(numberOfChars)} must be greater than 0"); + throw new ArgumentException(string.Format(Strings.PositiveNumberOfCharacters, nameof(numberOfChars))); } if ((_reader == null) | (_buffer == null)) @@ -571,7 +570,7 @@ private void InitializeFromStream(Stream stream, Encoding defaultEncoding, bool } if (!stream.CanRead) { - throw new ArgumentException("stream can't read"); + throw new ArgumentException(Strings.StreamDoesntSupportReading); } if (defaultEncoding == null) { @@ -585,7 +584,7 @@ private string ValidatePath(string path) { if (!File.Exists(path)) { - throw new FileNotFoundException($"{path} not found."); + throw new FileNotFoundException(Strings.FileNotFound); } return path; } @@ -666,7 +665,7 @@ private int IncreaseBufferSize() int bufferSize = _buffer.Length + DEFAULT_BUFFER_LENGTH; if (bufferSize > _maxBufferSize) { - throw new Exception("Exceeded maximum buffer size"); + throw new Exception(Strings.ExceededMaxBufferSize); } char[] tempArray = new char[bufferSize]; Array.Copy(_buffer, tempArray, _buffer.Length); @@ -802,7 +801,7 @@ private string[] ParseDelimitedLine() { _errorLine = line.TrimEnd(newLineChars); _errorLineNumber = currentLineNumber; - throw new Exception($"Line {currentLineNumber} cannot be parsed with the current Delimiters"); + throw new Exception(string.Format(Strings.CannotParseWithDelimiters, currentLineNumber)); } if (endHelper.FieldFinished) { @@ -820,13 +819,13 @@ private string[] ParseDelimitedLine() { _errorLine = line.TrimEnd(newLineChars); _errorLineNumber = currentLineNumber; - throw new Exception($"Line {currentLineNumber} cannot be parsed with the current Delimiters"); + throw new Exception(string.Format(Strings.CannotParseWithDelimiters, currentLineNumber)); } if (line.Length + newLine.Length > _maxLineSize) { _errorLine = line.TrimEnd(newLineChars); _errorLineNumber = currentLineNumber; - throw new Exception($"Line {currentLineNumber} cannot be read because it exceeds the max line size"); + throw new Exception(string.Format(Strings.LineExceedsMaxLineSize, currentLineNumber)); } line += newLine; lineEndIndex = GetEndOfLineIndex(line); @@ -835,7 +834,7 @@ private string[] ParseDelimitedLine() { _errorLine = line.TrimEnd(newLineChars); _errorLineNumber = currentLineNumber; - throw new Exception($"Line {currentLineNumber} cannot be parsed with the current Delimiters"); + throw new Exception(string.Format(Strings.CannotParseWithDelimiters, currentLineNumber)); } } while (!endHelper.FieldFinished); @@ -886,9 +885,9 @@ private string[] ParseFixedWidthLine() StringInfo lineInfo = new StringInfo(line); ValidateFixedWidthLine(lineInfo, _lineNumber - 1); int index = 0; - int bound = _fieldWidths.Length - 1; - string[] Fields = new string[bound + 1]; - for (int i = 0; i <= bound; i++) + int length = _fieldWidths.Length; + string[] Fields = new string[length]; + for (int i = 0; i < length; i++) { Fields[i] = GetFixedWidthField(lineInfo, index, _fieldWidths[i]); index += _fieldWidths[i]; @@ -898,7 +897,7 @@ private string[] ParseFixedWidthLine() private string GetFixedWidthField(StringInfo line, int index, int fieldLength) { - string field = ((fieldLength > 0) ? line.SubstringByTextElements(index, fieldLength) : ((index < line.LengthInTextElements) ? line.SubstringByTextElements(index).TrimEnd(newLineChars) : string.Empty)); + string field = (fieldLength > 0) ? line.SubstringByTextElements(index, fieldLength) : ((index < line.LengthInTextElements) ? line.SubstringByTextElements(index).TrimEnd(newLineChars) : string.Empty); if (_trimWhiteSpace) { return field.Trim(); @@ -937,7 +936,7 @@ private void ValidateFixedWidthLine(StringInfo line, long lineNumber) { _errorLine = line.String; _errorLineNumber = checked(_lineNumber - 1); - throw new Exception($"Line {lineNumber} cannot be parsed with the current FieldWidths"); + throw new Exception(string.Format(Strings.CannotParseWithFieldWidths, lineNumber)); } } @@ -945,11 +944,11 @@ private void ValidateFieldWidths() { if (_fieldWidths == null) { - throw new InvalidOperationException("m_FieldWidths is null"); + throw new InvalidOperationException(Strings.NullFieldWidths); } if (_fieldWidths.Length == 0) { - throw new InvalidOperationException("m_FieldWidths is empty"); + throw new InvalidOperationException(Strings.EmptyFieldWidths); } checked { @@ -976,7 +975,7 @@ private void ValidateFieldWidthsOnInput(int[] widths) { if (widths[i] < 1) { - throw new ArgumentException("All field widths, except the last element, must be greater than zero. A field width less than or equal to zero in the last element indicates the last field is of variable length."); + throw new ArgumentException(Strings.InvalidFieldWidths); } } } @@ -985,11 +984,11 @@ private void ValidateAndEscapeDelimiters() { if (_delimiters == null) { - throw new Exception("m_Delimiters is null"); + throw new Exception(Strings.NullDelimiters); } if (_delimiters.Length == 0) { - throw new Exception("m_Delimiters is empty"); + throw new Exception(Strings.EmptyDelimiters); } int length = _delimiters.Length; StringBuilder builder = new StringBuilder(); @@ -1001,11 +1000,11 @@ private void ValidateAndEscapeDelimiters() { if (_hasFieldsEnclosedInQuotes && _delimiters[i].IndexOf('"') > -1) { - throw new Exception("A double quote is not a legal delimiter when HasFieldsEnclosedInQuotes is set to True."); + throw new Exception(Strings.IllegalQuoteDelimiter); } - string EscapedDelimiter = Regex.Escape(_delimiters[i]); - builder.Append(EscapedDelimiter + "|"); - quoteBuilder.Append(EscapedDelimiter + "|"); + string escapedDelimiter = Regex.Escape(_delimiters[i]); + builder.Append(escapedDelimiter + "|"); + quoteBuilder.Append(escapedDelimiter + "|"); } else { @@ -1040,11 +1039,11 @@ private void ValidateReadyToRead() if (_commentTokens != null) { string[] commentTokens = _commentTokens; - foreach (string Token in commentTokens) + foreach (string token in commentTokens) { - if (Token != string.Empty && (_hasFieldsEnclosedInQuotes & (_textFieldType == FieldType.Delimited)) && string.Compare(Token.Trim(), "\"", StringComparison.Ordinal) == 0) + if (token != string.Empty && (_hasFieldsEnclosedInQuotes & (_textFieldType == FieldType.Delimited)) && string.Compare(token.Trim(), "\"", StringComparison.Ordinal) == 0) { - throw new Exception("A double quote is not a legal comment token when HasFieldsEnclosedInQuotes is set to True."); + throw new Exception(Strings.IllegalQuoteDelimiter); } } } @@ -1061,11 +1060,11 @@ private void ValidateDelimiters(string[] delimiterArray) { if (delimiter == string.Empty) { - throw new Exception("Delimiter cannot be empty"); + throw new Exception(Strings.EmptyDelimiters); } if (delimiter.IndexOfAny(newLineChars) > -1) { - throw new Exception("Delimiter cannot be new line characters"); + throw new Exception(Strings.DelimiterCannotBeNewlineChar); } } } @@ -1133,7 +1132,7 @@ private void CheckCommentTokensForWhitespace(string[] tokens) { if (token.Length == 1 && char.IsWhiteSpace(token[0])) { - throw new Exception("Comment token cannot contain whitespace"); + throw new Exception(Strings.CommentTokenCannotContainWhitespace); } } } diff --git a/src/Microsoft.Data.Analysis/strings.Designer.cs b/src/Microsoft.Data.Analysis/strings.Designer.cs index fc64940869..ff3cd6cadd 100644 --- a/src/Microsoft.Data.Analysis/strings.Designer.cs +++ b/src/Microsoft.Data.Analysis/strings.Designer.cs @@ -69,6 +69,24 @@ internal static string BadColumnCast { } } + /// + /// Looks up a localized string similar to Line {0} cannot be parsed with the current Delimiters.. + /// + internal static string CannotParseWithDelimiters { + get { + return ResourceManager.GetString("CannotParseWithDelimiters", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Line {0} cannot be parsed with the current FieldWidths.. + /// + internal static string CannotParseWithFieldWidths { + get { + return ResourceManager.GetString("CannotParseWithFieldWidths", resourceCulture); + } + } + /// /// Looks up a localized string similar to Cannot resize down. /// @@ -87,6 +105,15 @@ internal static string ColumnIndexOutOfRange { } } + /// + /// Looks up a localized string similar to Comment token cannot contain whitespace. + /// + internal static string CommentTokenCannotContainWhitespace { + get { + return ResourceManager.GetString("CommentTokenCannotContainWhitespace", resourceCulture); + } + } + /// /// Looks up a localized string similar to DataType. /// @@ -96,6 +123,15 @@ internal static string DataType { } } + /// + /// Looks up a localized string similar to Delimiter cannot be new line characters. + /// + internal static string DelimiterCannotBeNewlineChar { + get { + return ResourceManager.GetString("DelimiterCannotBeNewlineChar", resourceCulture); + } + } + /// /// Looks up a localized string similar to Length (excluding null values). /// @@ -114,6 +150,24 @@ internal static string DuplicateColumnName { } } + /// + /// Looks up a localized string similar to Delimiters is empty.. + /// + internal static string EmptyDelimiters { + get { + return ResourceManager.GetString("EmptyDelimiters", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to FieldWidths is empty.. + /// + internal static string EmptyFieldWidths { + get { + return ResourceManager.GetString("EmptyFieldWidths", resourceCulture); + } + } + /// /// Looks up a localized string similar to Empty file. /// @@ -123,6 +177,15 @@ internal static string EmptyFile { } } + /// + /// Looks up a localized string similar to Exceeded maximum buffer size.. + /// + internal static string ExceededMaxBufferSize { + get { + return ResourceManager.GetString("ExceededMaxBufferSize", resourceCulture); + } + } + /// /// Looks up a localized string similar to Parameter.Count exceeds the number of columns({0}) in the DataFrame . /// @@ -150,6 +213,24 @@ internal static string ExpectedEitherGuessRowsOrDataTypes { } } + /// + /// Looks up a localized string similar to {0} not found.. + /// + internal static string FileNotFound { + get { + return ResourceManager.GetString("FileNotFound", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to A double quote is not a legal delimiter when HasFieldsEnclosedInQuotes is set to True.. + /// + internal static string IllegalQuoteDelimiter { + get { + return ResourceManager.GetString("IllegalQuoteDelimiter", resourceCulture); + } + } + /// /// Looks up a localized string similar to Column is immutable. /// @@ -186,6 +267,15 @@ internal static string InvalidColumnName { } } + /// + /// Looks up a localized string similar to All field widths, except the last element, must be greater than zero. A field width less than or equal to zero in the last element indicates the last field is of variable length.. + /// + internal static string InvalidFieldWidths { + get { + return ResourceManager.GetString("InvalidFieldWidths", resourceCulture); + } + } + /// /// Looks up a localized string similar to Line {0} has less columns than expected. /// @@ -195,6 +285,15 @@ internal static string LessColumnsThatExpected { } } + /// + /// Looks up a localized string similar to Line {0} cannot be read because it exceeds the max line size.. + /// + internal static string LineExceedsMaxLineSize { + get { + return ResourceManager.GetString("LineExceedsMaxLineSize", resourceCulture); + } + } + /// /// Looks up a localized string similar to MapIndices exceeds column length. /// @@ -266,6 +365,24 @@ internal static string NotSupportedColumnType { return ResourceManager.GetString("NotSupportedColumnType", resourceCulture); } } + + /// + /// Looks up a localized string similar to Delimiters is null.. + /// + internal static string NullDelimiters { + get { + return ResourceManager.GetString("NullDelimiters", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to FieldWidths is null.. + /// + internal static string NullFieldWidths { + get { + return ResourceManager.GetString("NullFieldWidths", resourceCulture); + } + } /// /// Looks up a localized string similar to numeric column. @@ -276,6 +393,15 @@ internal static string NumericColumnType { } } + /// + /// Looks up a localized string similar to {0} must be greater than 0. + /// + internal static string PositiveNumberOfCharacters { + get { + return ResourceManager.GetString("PositiveNumberOfCharacters", resourceCulture); + } + } + /// /// Looks up a localized string similar to Cannot span multiple buffers. /// @@ -284,5 +410,14 @@ internal static string SpansMultipleBuffers { return ResourceManager.GetString("SpansMultipleBuffers", resourceCulture); } } + + /// + /// Looks up a localized string similar to Stream doesn't support reading. + /// + internal static string StreamDoesntSupportReading { + get { + return ResourceManager.GetString("StreamDoesntSupportReading", resourceCulture); + } + } } } diff --git a/src/Microsoft.Data.Analysis/strings.resx b/src/Microsoft.Data.Analysis/strings.resx index ad9f114050..de91078cec 100644 --- a/src/Microsoft.Data.Analysis/strings.resx +++ b/src/Microsoft.Data.Analysis/strings.resx @@ -120,24 +120,45 @@ Cannot cast column holding {0} values to type {1} + + Line {0} cannot be parsed with the current Delimiters. + + + Line {0} cannot be parsed with the current FieldWidths. + Cannot resize down Index cannot be greater than the Column's Length + + Comment token cannot contain whitespace + DataType + + Delimiter cannot be new line characters + Length (excluding null values) DataFrame already contains a column called {0} + + Delimiters is empty. + + + FieldWidths is empty. + Empty file + + Exceeded maximum buffer size. + Parameter.Count exceeds the number of columns({0}) in the DataFrame @@ -147,6 +168,12 @@ Expected either {0} or {1} to be provided + + {0} not found. + + + A double quote is not a legal delimiter when HasFieldsEnclosedInQuotes is set to True. + Column is immutable @@ -159,9 +186,15 @@ Column does not exist + + All field widths, except the last element, must be greater than zero. A field width less than or equal to zero in the last element indicates the last field is of variable length. + Line {0} has less columns than expected + + Line {0} cannot be read because it exceeds the max line size. + MapIndices exceeds column length @@ -186,10 +219,22 @@ {0} is not a supported column type. + + Delimiters is null. + + + FieldWidths is null. + numeric column + + {0} must be greater than 0 + Cannot span multiple buffers + + Stream doesn't support reading + \ No newline at end of file diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index 4445fd2bd7..2e54ebb959 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -143,6 +143,49 @@ void ReducedRowsTest(DataFrame reducedRows) ReducedRowsTest(csvDf); } + [Fact] + public void TestReadCsvSplitAcrossMultipleLines() + { + string CMT = @"""C +MT"""; + string verifyCMT = @"C +MT"; + string data = @$"{CMT},1,1,1271,3.8,CRD,17.5 +{CMT},1,1,474,1.5,CRD,8 +{CMT},1,1,637,1.4,CRD,8.5 +{CMT},1,1,181,0.6,CSH,4.5"; + + Stream GetStream(string streamData) + { + return new MemoryStream(Encoding.Default.GetBytes(streamData)); + } + void RegularTest(DataFrame df) + { + Assert.Equal(4, df.Rows.Count); + Assert.Equal(7, df.Columns.Count); + Assert.Equal(verifyCMT, df.Columns["Column0"][3]); + VerifyColumnTypes(df); + } + + DataFrame df = DataFrame.LoadCsv(GetStream(data), header: false); + RegularTest(df); + DataFrame csvDf = DataFrame.LoadCsvFromString(data, header: false); + RegularTest(csvDf); + + void ReducedRowsTest(DataFrame reducedRows) + { + Assert.Equal(3, reducedRows.Rows.Count); + Assert.Equal(7, reducedRows.Columns.Count); + Assert.Equal(verifyCMT, reducedRows.Columns["Column0"][2]); + VerifyColumnTypes(df); + } + + DataFrame reducedRows = DataFrame.LoadCsv(GetStream(data), header: false, numberOfRowsToRead: 3); + ReducedRowsTest(reducedRows); + csvDf = DataFrame.LoadCsvFromString(data, header: false, numberOfRowsToRead: 3); + ReducedRowsTest(csvDf); + } + [Theory] [InlineData(false)] [InlineData(true)] diff --git a/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj b/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj index 2b6f43253f..118a17189f 100644 --- a/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj +++ b/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj @@ -8,6 +8,12 @@ + + @@ -27,5 +33,19 @@ True DataFrameColumn.BinaryOperationTests.tt + + True + True + Strings.resx + + + + + ResXFileCodeGenerator + Strings.Designer.cs + Microsoft.Data + + + diff --git a/test/Microsoft.Data.Analysis.Tests/Strings.Designer.cs b/test/Microsoft.Data.Analysis.Tests/Strings.Designer.cs new file mode 100644 index 0000000000..aece87fba6 --- /dev/null +++ b/test/Microsoft.Data.Analysis.Tests/Strings.Designer.cs @@ -0,0 +1,414 @@ +//------------------------------------------------------------------------------ +// +// This code was generated by a tool. +// Runtime Version:4.0.30319.42000 +// +// Changes to this file may cause incorrect behavior and will be lost if +// the code is regenerated. +// +//------------------------------------------------------------------------------ + +namespace Microsoft.Data { + using System; + + + /// + /// A strongly-typed resource class, for looking up localized strings, etc. + /// + // This class was auto-generated by the StronglyTypedResourceBuilder + // class via a tool like ResGen or Visual Studio. + // To add or remove a member, edit your .ResX file then rerun ResGen + // with the /str option, or rebuild your VS project. + [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "16.0.0.0")] + [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] + [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] + internal class Strings { + + private static global::System.Resources.ResourceManager resourceMan; + + private static global::System.Globalization.CultureInfo resourceCulture; + + [global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")] + internal Strings() { + } + + /// + /// Returns the cached ResourceManager instance used by this class. + /// + [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] + internal static global::System.Resources.ResourceManager ResourceManager { + get { + if (object.ReferenceEquals(resourceMan, null)) { + global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("Microsoft.Data.Analysis.Tests.Strings", typeof(Strings).Assembly); + resourceMan = temp; + } + return resourceMan; + } + } + + /// + /// Overrides the current thread's CurrentUICulture property for all + /// resource lookups using this strongly typed resource class. + /// + [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] + internal static global::System.Globalization.CultureInfo Culture { + get { + return resourceCulture; + } + set { + resourceCulture = value; + } + } + + /// + /// Looks up a localized string similar to Cannot cast column holding {0} values to type {1}. + /// + internal static string BadColumnCast { + get { + return ResourceManager.GetString("BadColumnCast", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Line {0} cannot be parsed with the current Delimiters.. + /// + internal static string CannotParseWithDelimiters { + get { + return ResourceManager.GetString("CannotParseWithDelimiters", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Line {0} cannot be parsed with the current FieldWidths.. + /// + internal static string CannotParseWithFieldWidths { + get { + return ResourceManager.GetString("CannotParseWithFieldWidths", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Cannot resize down. + /// + internal static string CannotResizeDown { + get { + return ResourceManager.GetString("CannotResizeDown", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Index cannot be greater than the Column's Length. + /// + internal static string ColumnIndexOutOfRange { + get { + return ResourceManager.GetString("ColumnIndexOutOfRange", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Comment token cannot contain whitespace. + /// + internal static string CommentTokenCannotContainWhitespace { + get { + return ResourceManager.GetString("CommentTokenCannotContainWhitespace", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to DataType. + /// + internal static string DataType { + get { + return ResourceManager.GetString("DataType", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Delimiter cannot be new line characters. + /// + internal static string DelimiterCannotBeNewlineChar { + get { + return ResourceManager.GetString("DelimiterCannotBeNewlineChar", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Length (excluding null values). + /// + internal static string DescriptionMethodLength { + get { + return ResourceManager.GetString("DescriptionMethodLength", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to DataFrame already contains a column called {0}. + /// + internal static string DuplicateColumnName { + get { + return ResourceManager.GetString("DuplicateColumnName", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Delimiters is empty.. + /// + internal static string EmptyDelimiters { + get { + return ResourceManager.GetString("EmptyDelimiters", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to FieldWidths is empty.. + /// + internal static string EmptyFieldWidths { + get { + return ResourceManager.GetString("EmptyFieldWidths", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Empty file. + /// + internal static string EmptyFile { + get { + return ResourceManager.GetString("EmptyFile", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Exceeded maximum buffer size.. + /// + internal static string ExceededMaxBufferSize { + get { + return ResourceManager.GetString("ExceededMaxBufferSize", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Parameter.Count exceeds the number of columns({0}) in the DataFrame . + /// + internal static string ExceedsNumberOfColumns { + get { + return ResourceManager.GetString("ExceedsNumberOfColumns", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Parameter.Count exceeds the number of rows({0}) in the DataFrame . + /// + internal static string ExceedsNumberOfRows { + get { + return ResourceManager.GetString("ExceedsNumberOfRows", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Expected either {0} or {1} to be provided. + /// + internal static string ExpectedEitherGuessRowsOrDataTypes { + get { + return ResourceManager.GetString("ExpectedEitherGuessRowsOrDataTypes", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to {0} not found.. + /// + internal static string FileNotFound { + get { + return ResourceManager.GetString("FileNotFound", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to A double quote is not a legal delimiter when HasFieldsEnclosedInQuotes is set to True.. + /// + internal static string IllegalQuoteDelimiter { + get { + return ResourceManager.GetString("IllegalQuoteDelimiter", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Column is immutable. + /// + internal static string ImmutableColumn { + get { + return ResourceManager.GetString("ImmutableColumn", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Inconsistent null bitmap and data buffer lengths. + /// + internal static string InconsistentNullBitMapAndLength { + get { + return ResourceManager.GetString("InconsistentNullBitMapAndLength", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Inconsistent null bitmaps and NullCounts. + /// + internal static string InconsistentNullBitMapAndNullCount { + get { + return ResourceManager.GetString("InconsistentNullBitMapAndNullCount", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Column does not exist. + /// + internal static string InvalidColumnName { + get { + return ResourceManager.GetString("InvalidColumnName", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to All field widths, except the last element, must be greater than zero. A field width less than or equal to zero in the last element indicates the last field is of variable length.. + /// + internal static string InvalidFieldWidths { + get { + return ResourceManager.GetString("InvalidFieldWidths", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Line {0} has less columns than expected. + /// + internal static string LessColumnsThatExpected { + get { + return ResourceManager.GetString("LessColumnsThatExpected", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Line {0} cannot be read because it exceeds the max line size.. + /// + internal static string LineExceedsMaxLineSize { + get { + return ResourceManager.GetString("LineExceedsMaxLineSize", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to MapIndices exceeds column length. + /// + internal static string MapIndicesExceedsColumnLenth { + get { + return ResourceManager.GetString("MapIndicesExceedsColumnLenth", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Column lengths are mismatched. + /// + internal static string MismatchedColumnLengths { + get { + return ResourceManager.GetString("MismatchedColumnLengths", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Expected column to hold values of type {0}. + /// + internal static string MismatchedColumnValueType { + get { + return ResourceManager.GetString("MismatchedColumnValueType", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to rowCount differs from Column length for Column . + /// + internal static string MismatchedRowCount { + get { + return ResourceManager.GetString("MismatchedRowCount", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Expected value to be of type {0}. + /// + internal static string MismatchedValueType { + get { + return ResourceManager.GetString("MismatchedValueType", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Expected value to be of type {0}, {1} or {2}. + /// + internal static string MultipleMismatchedValueType { + get { + return ResourceManager.GetString("MultipleMismatchedValueType", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Expected a seekable stream. + /// + internal static string NonSeekableStream { + get { + return ResourceManager.GetString("NonSeekableStream", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Delimiters is null.. + /// + internal static string NullDelimiters { + get { + return ResourceManager.GetString("NullDelimiters", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to FieldWidths is null.. + /// + internal static string NullFieldWidths { + get { + return ResourceManager.GetString("NullFieldWidths", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to numeric column. + /// + internal static string NumericColumnType { + get { + return ResourceManager.GetString("NumericColumnType", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to {0} must be greater than 0. + /// + internal static string PositiveNumberOfCharacters { + get { + return ResourceManager.GetString("PositiveNumberOfCharacters", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Cannot span multiple buffers. + /// + internal static string SpansMultipleBuffers { + get { + return ResourceManager.GetString("SpansMultipleBuffers", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to Stream doesn't support reading. + /// + internal static string StreamDoesntSupportReading { + get { + return ResourceManager.GetString("StreamDoesntSupportReading", resourceCulture); + } + } + } +} diff --git a/test/Microsoft.Data.Analysis.Tests/Strings.resx b/test/Microsoft.Data.Analysis.Tests/Strings.resx new file mode 100644 index 0000000000..51c3562901 --- /dev/null +++ b/test/Microsoft.Data.Analysis.Tests/Strings.resx @@ -0,0 +1,237 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + text/microsoft-resx + + + 2.0 + + + System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 + + + System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 + + + Cannot cast column holding {0} values to type {1} + + + Line {0} cannot be parsed with the current Delimiters. + + + Line {0} cannot be parsed with the current FieldWidths. + + + Cannot resize down + + + Index cannot be greater than the Column's Length + + + Comment token cannot contain whitespace + + + DataType + + + Delimiter cannot be new line characters + + + Length (excluding null values) + + + DataFrame already contains a column called {0} + + + Delimiters is empty. + + + FieldWidths is empty. + + + Empty file + + + Exceeded maximum buffer size. + + + Parameter.Count exceeds the number of columns({0}) in the DataFrame + + + Parameter.Count exceeds the number of rows({0}) in the DataFrame + + + Expected either {0} or {1} to be provided + + + {0} not found. + + + A double quote is not a legal delimiter when HasFieldsEnclosedInQuotes is set to True. + + + Column is immutable + + + Inconsistent null bitmap and data buffer lengths + + + Inconsistent null bitmaps and NullCounts + + + Column does not exist + + + All field widths, except the last element, must be greater than zero. A field width less than or equal to zero in the last element indicates the last field is of variable length. + + + Line {0} has less columns than expected + + + Line {0} cannot be read because it exceeds the max line size. + + + MapIndices exceeds column length + + + Column lengths are mismatched + + + Expected column to hold values of type {0} + + + rowCount differs from Column length for Column + + + Expected value to be of type {0} + + + Expected value to be of type {0}, {1} or {2} + + + Expected a seekable stream + + + Delimiters is null. + + + FieldWidths is null. + + + numeric column + + + {0} must be greater than 0 + + + Cannot span multiple buffers + + + Stream doesn't support reading + + \ No newline at end of file diff --git a/test/Microsoft.Data.Analysis.Tests/TextFieldParserTests.cs b/test/Microsoft.Data.Analysis.Tests/TextFieldParserTests.cs index c7a0d5335f..c481d13b28 100644 --- a/test/Microsoft.Data.Analysis.Tests/TextFieldParserTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/TextFieldParserTests.cs @@ -361,5 +361,59 @@ public void UnmatchedQuote_MalformedLineException() Assert.Throws(() => parser.ReadFields()); } } + + [Fact] + public void ReadFields_QuoteOnNewLine() + { + string data = @"abc,""123 +123"" +def,456 +ghi,789"; + + using (var parser = new TextFieldParser(GetStream(data))) + { + Assert.Equal(1, parser.LineNumber); + + Assert.Throws(() => parser.ReadFields()); + Assert.Equal(1, parser.LineNumber); + + parser.SetDelimiters(new[] { "," }); + Assert.Equal(new[] { "abc",@"123 +123" }, parser.ReadFields()); + Assert.Equal(3, parser.LineNumber); + + parser.SetDelimiters(new[] { ";", "," }); + Assert.Equal(new[] { "def", "456" }, parser.ReadFields()); + Assert.Equal(4, parser.LineNumber); + + parser.SetDelimiters(new[] { "g", "9" }); + Assert.Equal(new[] { "", "hi,78", "" }, parser.ReadFields()); + Assert.Equal(-1, parser.LineNumber); + } + + data = @",, + +, +"; + + using (var parser = new TextFieldParser(GetStream(data))) + { + Assert.Equal(1, parser.LineNumber); + + parser.SetDelimiters(new[] { "," }); + Assert.Equal(new[] { "", "", "" }, parser.ReadFields()); + Assert.Equal(2, parser.LineNumber); + + // ReadFields should ignore the empty new line + Assert.Equal(new[] { "", "" }, parser.ReadFields()); + Assert.Equal(-1, parser.LineNumber); + + Assert.Null(parser.ReadFields()); + Assert.Equal(-1, parser.LineNumber); + + Assert.Null(parser.ReadFields()); + Assert.Equal(-1, parser.LineNumber); + } + } } } From b3d510dc8bc13d26116e8e3e7ef679b2cc65e0bb Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Fri, 12 Mar 2021 11:51:56 -0800 Subject: [PATCH 04/10] cleanup --- .../Microsoft.Data.Analysis.Tests.csproj | 6 ------ 1 file changed, 6 deletions(-) diff --git a/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj b/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj index 118a17189f..cc6a1d5f30 100644 --- a/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj +++ b/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj @@ -8,12 +8,6 @@ - - From 642357292afdfd8ed17dc1254a86a1c4815dee58 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Tue, 16 Mar 2021 16:22:43 -0700 Subject: [PATCH 05/10] Address feedback --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 53 ++++++++------- .../TextFieldParser.cs | 2 +- .../DataFrame.IOTests.cs | 66 ++----------------- 3 files changed, 32 insertions(+), 89 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index 7951d76405..95ba7736fa 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -183,7 +183,7 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes))); } - TextReader textReader = wrappedReader.TextReader; + TextReader textReader = wrappedReader.GetTextReader(); TextFieldParser parser = new TextFieldParser(textReader); parser.SetDelimiters(separator.ToString()); @@ -244,36 +244,38 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe DataFrame ret = new DataFrame(columns); // Fill values. - textReader = wrappedReader.TextReader; - parser = new TextFieldParser(textReader); - parser.SetDelimiters(separator.ToString()); - - rowline = 0; - while ((fields = parser.ReadFields()) != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead)) + using (textReader = wrappedReader.GetTextReader()) { - string[] spl = fields; - if (header && rowline == 0) - { - // Skips. - } - else + parser = new TextFieldParser(textReader); + parser.SetDelimiters(separator.ToString()); + + rowline = 0; + while ((fields = parser.ReadFields()) != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead)) { - ret.Append(spl, inPlace: true); + string[] spl = fields; + if (header && rowline == 0) + { + // Skips. + } + else + { + ret.Append(spl, inPlace: true); + } + ++rowline; } - ++rowline; - } - if (addIndexColumn) - { - PrimitiveDataFrameColumn indexColumn = new PrimitiveDataFrameColumn("IndexColumn", columns[0].Length); - for (int i = 0; i < columns[0].Length; i++) + if (addIndexColumn) { - indexColumn[i] = i; + PrimitiveDataFrameColumn indexColumn = new PrimitiveDataFrameColumn("IndexColumn", columns[0].Length); + for (int i = 0; i < columns[0].Length; i++) + { + indexColumn[i] = i; + } + columns.Insert(0, indexColumn); } - columns.Insert(0, indexColumn); + } - textReader.Dispose(); return ret; } @@ -300,12 +302,9 @@ public WrappedStreamReaderOrStringReader(string csvString) _stream = null; } - public long InitialPosition => _initialPosition; - // Returns a new TextReader. If the wrapped object is a stream, the stream is reset to its initial position. - public TextReader TextReader + public TextReader GetTextReader() { - get { if (_stream != null) { diff --git a/src/Microsoft.Data.Analysis/TextFieldParser.cs b/src/Microsoft.Data.Analysis/TextFieldParser.cs index 7ecff015c4..e5c279db1d 100644 --- a/src/Microsoft.Data.Analysis/TextFieldParser.cs +++ b/src/Microsoft.Data.Analysis/TextFieldParser.cs @@ -148,7 +148,7 @@ internal class TextFieldParser : IDisposable private Regex _delimiterWithEndCharsRegex; - private int[] _whitespaceCodes = new int[] { '\u0020' }; + private int[] _whitespaceCodes = new int[] { '\u0009', '\u000B', '\u000C', '\u0020', '\u0085', '\u00A0', '\u1680', '\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A', '\u200B', '\u2028', '\u2029', '\u3000', '\uFEFF' }; private Regex _beginQuotesRegex; diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index 2e54ebb959..93bf2dad54 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -101,6 +101,11 @@ internal static void VerifyColumnTypes(DataFrame df, bool testArrowStringColumn } } + private static Stream GetStream(string streamData) + { + return new MemoryStream(Encoding.Default.GetBytes(streamData)); + } + [Theory] [InlineData(false)] [InlineData(true)] @@ -114,10 +119,6 @@ public void TestReadCsvWithHeader(bool useQuotes) {CMT},1,1,637,1.4,CRD,8.5 {CMT},1,1,181,0.6,CSH,4.5"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } void RegularTest(DataFrame df) { Assert.Equal(4, df.Rows.Count); @@ -155,10 +156,6 @@ public void TestReadCsvSplitAcrossMultipleLines() {CMT},1,1,637,1.4,CRD,8.5 {CMT},1,1,181,0.6,CSH,4.5"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } void RegularTest(DataFrame df) { Assert.Equal(4, df.Rows.Count); @@ -198,10 +195,6 @@ public void TestReadCsvNoHeader(bool useQuotes) {CMT},1,1,637,1.4,CRD,8.5 {CMT},1,1,181,0.6,CSH,4.5"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } void RegularTest(DataFrame df) { Assert.Equal(4, df.Rows.Count); @@ -292,10 +285,6 @@ False 10 Null CMT,1,1,637,1.4,CRD,8.5 CMT,1,1,181,0.6,CSH,4.5"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } string data = header ? headerLine + dataLines : dataLines; DataFrame df = DataFrame.LoadCsv(GetStream(data), @@ -346,10 +335,6 @@ public void TestReadCsvWithTypes() ,,,,,, CMT,1,1,181,0.6,CSH,4.5"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } void Verify(DataFrame df) { Assert.Equal(5, df.Rows.Count); @@ -409,10 +394,6 @@ public void TestReadCsvWithPipeSeparator() |||||| CMT|1|1|181|0.6|CSH|4.5"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } void Verify(DataFrame df) { Assert.Equal(5, df.Rows.Count); @@ -452,10 +433,6 @@ public void TestReadCsvWithSemicolonSeparator() ;;;;;; CMT;1;1;181;0.6;CSH;4.5"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } void Verify(DataFrame df) { Assert.Equal(5, df.Rows.Count); @@ -494,10 +471,6 @@ public void TestReadCsvWithExtraColumnInHeader() CMT,1,1,637,1.4,CRD,8.5 CMT,1,1,181,0.6,CSH,4.5"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } void Verify(DataFrame df) { Assert.Equal(4, df.Rows.Count); @@ -527,11 +500,6 @@ public void TestReadCsvWithExtraColumnInRow() CMT,1,1,637,1.4,CRD,8.5,0 CMT,1,1,181,0.6,CSH,4.5,0"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } - Assert.Throws(() => DataFrame.LoadCsv(GetStream(data))); Assert.Throws(() => DataFrame.LoadCsvFromString(data)); } @@ -545,11 +513,6 @@ public void TestReadCsvWithLessColumnsInRow() CMT,1,1,637,1.4,CRD CMT,1,1,181,0.6,CSH"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } - void Verify(DataFrame df) { Assert.Equal(4, df.Rows.Count); @@ -581,11 +544,6 @@ public void TestReadCsvWithAllNulls() null,null,null,null null,null,null,null"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } - void Verify(DataFrame df) { Assert.Equal(6, df.Rows.Count); @@ -629,11 +587,6 @@ public void TestReadCsvWithNullsAndDataTypes() ,,, CMT,1,1,null"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } - void Verify(DataFrame df) { Assert.Equal(6, df.Rows.Count); @@ -696,11 +649,6 @@ public void TestReadCsvWithNulls() ,,, CMT,1,1,null"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } - void Verify(DataFrame df) { Assert.Equal(6, df.Rows.Count); @@ -918,10 +866,6 @@ public void TestMixedDataTypesInCsv() , CMT,"; - Stream GetStream(string streamData) - { - return new MemoryStream(Encoding.Default.GetBytes(streamData)); - } DataFrame df = DataFrame.LoadCsv(GetStream(data)); Assert.Equal(6, df.Rows.Count); Assert.Equal(2, df.Columns.Count); From a85391ce39ea75c1e5470559f132010951c7fc01 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Thu, 18 Mar 2021 13:48:00 -0700 Subject: [PATCH 06/10] Last bit of feedback --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 107 ++++++++++---------- 1 file changed, 53 insertions(+), 54 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index 95ba7736fa..913be59e67 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -183,73 +183,74 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes))); } - TextReader textReader = wrappedReader.GetTextReader(); - TextFieldParser parser = new TextFieldParser(textReader); - parser.SetDelimiters(separator.ToString()); - - var linesForGuessType = new List(); - long rowline = 0; - int numberOfColumns = dataTypes?.Length ?? 0; - - if (header == true && numberOfRowsToRead != -1) - { - numberOfRowsToRead++; - } - List columns; - // First pass: schema and number of rows. - string[] fields; - while ((fields = parser.ReadFields()) != null) + using (var textReader = wrappedReader.GetTextReader()) { - if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead) + TextFieldParser parser = new TextFieldParser(textReader); + parser.SetDelimiters(separator.ToString()); + + var linesForGuessType = new List(); + long rowline = 0; + int numberOfColumns = dataTypes?.Length ?? 0; + + if (header == true && numberOfRowsToRead != -1) { - if (linesForGuessType.Count < guessRows || (header && rowline == 0)) + numberOfRowsToRead++; + } + + // First pass: schema and number of rows. + while ((fields = parser.ReadFields()) != null) + { + if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead) { - string[] spl = fields; - if (header && rowline == 0) + if (linesForGuessType.Count < guessRows || (header && rowline == 0)) { - if (columnNames == null) + string[] spl = fields; + if (header && rowline == 0) { - columnNames = spl; + if (columnNames == null) + { + columnNames = spl; + } + } + else + { + linesForGuessType.Add(spl); + numberOfColumns = Math.Max(numberOfColumns, spl.Length); } - } - else - { - linesForGuessType.Add(spl); - numberOfColumns = Math.Max(numberOfColumns, spl.Length); } } + ++rowline; + if (rowline == guessRows || guessRows == 0) + { + break; + } } - ++rowline; - if (rowline == guessRows || guessRows == 0) + + if (rowline == 0) { - break; + throw new FormatException(Strings.EmptyFile); } - } - if (rowline == 0) - { - throw new FormatException(Strings.EmptyFile); - } - - columns = new List(numberOfColumns); - // Guesses types or looks up dataTypes and adds columns. - for (int i = 0; i < numberOfColumns; ++i) - { - Type kind = dataTypes == null ? GuessKind(i, linesForGuessType) : dataTypes[i]; - columns.Add(CreateColumn(kind, columnNames, i)); + columns = new List(numberOfColumns); + // Guesses types or looks up dataTypes and adds columns. + for (int i = 0; i < numberOfColumns; ++i) + { + Type kind = dataTypes == null ? GuessKind(i, linesForGuessType) : dataTypes[i]; + columns.Add(CreateColumn(kind, columnNames, i)); + } } DataFrame ret = new DataFrame(columns); // Fill values. - using (textReader = wrappedReader.GetTextReader()) + using (var textReader = wrappedReader.GetTextReader()) { - parser = new TextFieldParser(textReader); + TextFieldParser parser = new TextFieldParser(textReader); parser.SetDelimiters(separator.ToString()); - rowline = 0; + long rowline = 0; while ((fields = parser.ReadFields()) != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead)) { string[] spl = fields; @@ -305,16 +306,14 @@ public WrappedStreamReaderOrStringReader(string csvString) // Returns a new TextReader. If the wrapped object is a stream, the stream is reset to its initial position. public TextReader GetTextReader() { + if (_stream != null) { - if (_stream != null) - { - _stream.Seek(_initialPosition, SeekOrigin.Begin); - return new StreamReader(_stream, _encoding, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true); - } - else - { - return new StringReader(_csvString); - } + _stream.Seek(_initialPosition, SeekOrigin.Begin); + return new StreamReader(_stream, _encoding, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true); + } + else + { + return new StringReader(_csvString); } } From 3464aa51ce387993e2d5d313816fa5ffafae50b1 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Fri, 19 Mar 2021 13:39:06 -0700 Subject: [PATCH 07/10] Remove extra var --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index 913be59e67..1d39d08258 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -206,18 +206,17 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe { if (linesForGuessType.Count < guessRows || (header && rowline == 0)) { - string[] spl = fields; if (header && rowline == 0) { if (columnNames == null) { - columnNames = spl; + columnNames = fields; } } else { - linesForGuessType.Add(spl); - numberOfColumns = Math.Max(numberOfColumns, spl.Length); + linesForGuessType.Add(fields); + numberOfColumns = Math.Max(numberOfColumns, fields.Length); } } } @@ -253,14 +252,13 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe long rowline = 0; while ((fields = parser.ReadFields()) != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead)) { - string[] spl = fields; if (header && rowline == 0) { // Skips. } else { - ret.Append(spl, inPlace: true); + ret.Append(fields, inPlace: true); } ++rowline; } From a3cf4360697f75de450537f4e3f56260571815c3 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Fri, 19 Mar 2021 13:53:54 -0700 Subject: [PATCH 08/10] Remove duplicate file --- .../Microsoft.Data.Analysis.Tests.csproj | 12 +- .../Strings.Designer.cs | 414 ------------------ .../Strings.resx | 237 ---------- 3 files changed, 2 insertions(+), 661 deletions(-) delete mode 100644 test/Microsoft.Data.Analysis.Tests/Strings.Designer.cs delete mode 100644 test/Microsoft.Data.Analysis.Tests/Strings.resx diff --git a/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj b/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj index cc6a1d5f30..8032812924 100644 --- a/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj +++ b/test/Microsoft.Data.Analysis.Tests/Microsoft.Data.Analysis.Tests.csproj @@ -27,19 +27,11 @@ True DataFrameColumn.BinaryOperationTests.tt - - True - True - Strings.resx - - - ResXFileCodeGenerator - Strings.Designer.cs - Microsoft.Data - + + diff --git a/test/Microsoft.Data.Analysis.Tests/Strings.Designer.cs b/test/Microsoft.Data.Analysis.Tests/Strings.Designer.cs deleted file mode 100644 index aece87fba6..0000000000 --- a/test/Microsoft.Data.Analysis.Tests/Strings.Designer.cs +++ /dev/null @@ -1,414 +0,0 @@ -//------------------------------------------------------------------------------ -// -// This code was generated by a tool. -// Runtime Version:4.0.30319.42000 -// -// Changes to this file may cause incorrect behavior and will be lost if -// the code is regenerated. -// -//------------------------------------------------------------------------------ - -namespace Microsoft.Data { - using System; - - - /// - /// A strongly-typed resource class, for looking up localized strings, etc. - /// - // This class was auto-generated by the StronglyTypedResourceBuilder - // class via a tool like ResGen or Visual Studio. - // To add or remove a member, edit your .ResX file then rerun ResGen - // with the /str option, or rebuild your VS project. - [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "16.0.0.0")] - [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] - [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] - internal class Strings { - - private static global::System.Resources.ResourceManager resourceMan; - - private static global::System.Globalization.CultureInfo resourceCulture; - - [global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")] - internal Strings() { - } - - /// - /// Returns the cached ResourceManager instance used by this class. - /// - [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] - internal static global::System.Resources.ResourceManager ResourceManager { - get { - if (object.ReferenceEquals(resourceMan, null)) { - global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("Microsoft.Data.Analysis.Tests.Strings", typeof(Strings).Assembly); - resourceMan = temp; - } - return resourceMan; - } - } - - /// - /// Overrides the current thread's CurrentUICulture property for all - /// resource lookups using this strongly typed resource class. - /// - [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] - internal static global::System.Globalization.CultureInfo Culture { - get { - return resourceCulture; - } - set { - resourceCulture = value; - } - } - - /// - /// Looks up a localized string similar to Cannot cast column holding {0} values to type {1}. - /// - internal static string BadColumnCast { - get { - return ResourceManager.GetString("BadColumnCast", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Line {0} cannot be parsed with the current Delimiters.. - /// - internal static string CannotParseWithDelimiters { - get { - return ResourceManager.GetString("CannotParseWithDelimiters", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Line {0} cannot be parsed with the current FieldWidths.. - /// - internal static string CannotParseWithFieldWidths { - get { - return ResourceManager.GetString("CannotParseWithFieldWidths", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Cannot resize down. - /// - internal static string CannotResizeDown { - get { - return ResourceManager.GetString("CannotResizeDown", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Index cannot be greater than the Column's Length. - /// - internal static string ColumnIndexOutOfRange { - get { - return ResourceManager.GetString("ColumnIndexOutOfRange", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Comment token cannot contain whitespace. - /// - internal static string CommentTokenCannotContainWhitespace { - get { - return ResourceManager.GetString("CommentTokenCannotContainWhitespace", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to DataType. - /// - internal static string DataType { - get { - return ResourceManager.GetString("DataType", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Delimiter cannot be new line characters. - /// - internal static string DelimiterCannotBeNewlineChar { - get { - return ResourceManager.GetString("DelimiterCannotBeNewlineChar", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Length (excluding null values). - /// - internal static string DescriptionMethodLength { - get { - return ResourceManager.GetString("DescriptionMethodLength", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to DataFrame already contains a column called {0}. - /// - internal static string DuplicateColumnName { - get { - return ResourceManager.GetString("DuplicateColumnName", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Delimiters is empty.. - /// - internal static string EmptyDelimiters { - get { - return ResourceManager.GetString("EmptyDelimiters", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to FieldWidths is empty.. - /// - internal static string EmptyFieldWidths { - get { - return ResourceManager.GetString("EmptyFieldWidths", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Empty file. - /// - internal static string EmptyFile { - get { - return ResourceManager.GetString("EmptyFile", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Exceeded maximum buffer size.. - /// - internal static string ExceededMaxBufferSize { - get { - return ResourceManager.GetString("ExceededMaxBufferSize", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Parameter.Count exceeds the number of columns({0}) in the DataFrame . - /// - internal static string ExceedsNumberOfColumns { - get { - return ResourceManager.GetString("ExceedsNumberOfColumns", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Parameter.Count exceeds the number of rows({0}) in the DataFrame . - /// - internal static string ExceedsNumberOfRows { - get { - return ResourceManager.GetString("ExceedsNumberOfRows", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Expected either {0} or {1} to be provided. - /// - internal static string ExpectedEitherGuessRowsOrDataTypes { - get { - return ResourceManager.GetString("ExpectedEitherGuessRowsOrDataTypes", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to {0} not found.. - /// - internal static string FileNotFound { - get { - return ResourceManager.GetString("FileNotFound", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to A double quote is not a legal delimiter when HasFieldsEnclosedInQuotes is set to True.. - /// - internal static string IllegalQuoteDelimiter { - get { - return ResourceManager.GetString("IllegalQuoteDelimiter", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Column is immutable. - /// - internal static string ImmutableColumn { - get { - return ResourceManager.GetString("ImmutableColumn", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Inconsistent null bitmap and data buffer lengths. - /// - internal static string InconsistentNullBitMapAndLength { - get { - return ResourceManager.GetString("InconsistentNullBitMapAndLength", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Inconsistent null bitmaps and NullCounts. - /// - internal static string InconsistentNullBitMapAndNullCount { - get { - return ResourceManager.GetString("InconsistentNullBitMapAndNullCount", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Column does not exist. - /// - internal static string InvalidColumnName { - get { - return ResourceManager.GetString("InvalidColumnName", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to All field widths, except the last element, must be greater than zero. A field width less than or equal to zero in the last element indicates the last field is of variable length.. - /// - internal static string InvalidFieldWidths { - get { - return ResourceManager.GetString("InvalidFieldWidths", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Line {0} has less columns than expected. - /// - internal static string LessColumnsThatExpected { - get { - return ResourceManager.GetString("LessColumnsThatExpected", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Line {0} cannot be read because it exceeds the max line size.. - /// - internal static string LineExceedsMaxLineSize { - get { - return ResourceManager.GetString("LineExceedsMaxLineSize", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to MapIndices exceeds column length. - /// - internal static string MapIndicesExceedsColumnLenth { - get { - return ResourceManager.GetString("MapIndicesExceedsColumnLenth", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Column lengths are mismatched. - /// - internal static string MismatchedColumnLengths { - get { - return ResourceManager.GetString("MismatchedColumnLengths", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Expected column to hold values of type {0}. - /// - internal static string MismatchedColumnValueType { - get { - return ResourceManager.GetString("MismatchedColumnValueType", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to rowCount differs from Column length for Column . - /// - internal static string MismatchedRowCount { - get { - return ResourceManager.GetString("MismatchedRowCount", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Expected value to be of type {0}. - /// - internal static string MismatchedValueType { - get { - return ResourceManager.GetString("MismatchedValueType", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Expected value to be of type {0}, {1} or {2}. - /// - internal static string MultipleMismatchedValueType { - get { - return ResourceManager.GetString("MultipleMismatchedValueType", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Expected a seekable stream. - /// - internal static string NonSeekableStream { - get { - return ResourceManager.GetString("NonSeekableStream", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Delimiters is null.. - /// - internal static string NullDelimiters { - get { - return ResourceManager.GetString("NullDelimiters", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to FieldWidths is null.. - /// - internal static string NullFieldWidths { - get { - return ResourceManager.GetString("NullFieldWidths", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to numeric column. - /// - internal static string NumericColumnType { - get { - return ResourceManager.GetString("NumericColumnType", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to {0} must be greater than 0. - /// - internal static string PositiveNumberOfCharacters { - get { - return ResourceManager.GetString("PositiveNumberOfCharacters", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Cannot span multiple buffers. - /// - internal static string SpansMultipleBuffers { - get { - return ResourceManager.GetString("SpansMultipleBuffers", resourceCulture); - } - } - - /// - /// Looks up a localized string similar to Stream doesn't support reading. - /// - internal static string StreamDoesntSupportReading { - get { - return ResourceManager.GetString("StreamDoesntSupportReading", resourceCulture); - } - } - } -} diff --git a/test/Microsoft.Data.Analysis.Tests/Strings.resx b/test/Microsoft.Data.Analysis.Tests/Strings.resx deleted file mode 100644 index 51c3562901..0000000000 --- a/test/Microsoft.Data.Analysis.Tests/Strings.resx +++ /dev/null @@ -1,237 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - text/microsoft-resx - - - 2.0 - - - System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 - - - System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 - - - Cannot cast column holding {0} values to type {1} - - - Line {0} cannot be parsed with the current Delimiters. - - - Line {0} cannot be parsed with the current FieldWidths. - - - Cannot resize down - - - Index cannot be greater than the Column's Length - - - Comment token cannot contain whitespace - - - DataType - - - Delimiter cannot be new line characters - - - Length (excluding null values) - - - DataFrame already contains a column called {0} - - - Delimiters is empty. - - - FieldWidths is empty. - - - Empty file - - - Exceeded maximum buffer size. - - - Parameter.Count exceeds the number of columns({0}) in the DataFrame - - - Parameter.Count exceeds the number of rows({0}) in the DataFrame - - - Expected either {0} or {1} to be provided - - - {0} not found. - - - A double quote is not a legal delimiter when HasFieldsEnclosedInQuotes is set to True. - - - Column is immutable - - - Inconsistent null bitmap and data buffer lengths - - - Inconsistent null bitmaps and NullCounts - - - Column does not exist - - - All field widths, except the last element, must be greater than zero. A field width less than or equal to zero in the last element indicates the last field is of variable length. - - - Line {0} has less columns than expected - - - Line {0} cannot be read because it exceeds the max line size. - - - MapIndices exceeds column length - - - Column lengths are mismatched - - - Expected column to hold values of type {0} - - - rowCount differs from Column length for Column - - - Expected value to be of type {0} - - - Expected value to be of type {0}, {1} or {2} - - - Expected a seekable stream - - - Delimiters is null. - - - FieldWidths is null. - - - numeric column - - - {0} must be greater than 0 - - - Cannot span multiple buffers - - - Stream doesn't support reading - - \ No newline at end of file From f51cf2e0b836c0977491d7e6740c1a0f400a2bc2 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Fri, 19 Mar 2021 14:52:03 -0700 Subject: [PATCH 09/10] Rename strings.resx to Strings.resx --- src/Microsoft.Data.Analysis/{strings.resx => Strings.resx} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/Microsoft.Data.Analysis/{strings.resx => Strings.resx} (100%) diff --git a/src/Microsoft.Data.Analysis/strings.resx b/src/Microsoft.Data.Analysis/Strings.resx similarity index 100% rename from src/Microsoft.Data.Analysis/strings.resx rename to src/Microsoft.Data.Analysis/Strings.resx From 90c28ecc352a5858ef10b35c7bc92fc8880c72d2 Mon Sep 17 00:00:00 2001 From: Prashanth Govindarajan Date: Mon, 22 Mar 2021 10:11:59 -0700 Subject: [PATCH 10/10] rename the designer.cs file too --- .../{strings.Designer.cs => Strings.Designer.cs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/Microsoft.Data.Analysis/{strings.Designer.cs => Strings.Designer.cs} (100%) diff --git a/src/Microsoft.Data.Analysis/strings.Designer.cs b/src/Microsoft.Data.Analysis/Strings.Designer.cs similarity index 100% rename from src/Microsoft.Data.Analysis/strings.Designer.cs rename to src/Microsoft.Data.Analysis/Strings.Designer.cs