|
using System; |
|
using System.Collections.Generic; |
|
using System.IO; |
|
using System.Linq.Expressions; |
|
using System.Management.Automation; |
|
using System.Reflection; |
|
using System.Text; |
|
|
|
/// <summary> |
|
/// Helper class to import single CSV file. |
|
/// </summary> |
|
public static class ImportCsvHelper |
|
{ |
|
// Initial sizes of the value list and the line stringbuilder. |
|
// Set to reasonable initial sizes. They may grow beyond these, |
|
// but this will prevent a few reallocations. |
|
private const int ValueCountGuestimate = 16; |
|
private const int LineLengthGuestimate = 256; |
|
|
|
/// <summary>
/// Reads a comma-delimited file using the supplied <paramref name="encoding"/>.
/// </summary>
/// <typeparam name="T">Type instantiated for every data record.</typeparam>
/// <param name="path">Path of the file to read.</param>
/// <param name="encoding">Encoding used to decode the file.</param>
/// <returns>A lazily evaluated sequence with one <typeparamref name="T"/> per data record.</returns>
public static IEnumerable<T> Read<T>(string path, Encoding encoding)
    where T : class, new()
    // The delimiter is passed positionally on purpose: a call of the form
    // Read<T>(path, encoding: encoding) binds back to this very overload
    // (no default arguments need substituting), which recursed forever.
    => Read<T>(path, ',', encoding);
|
|
|
/// <summary>
/// Lazily reads a delimited text file and creates one <typeparamref name="T"/> per data record,
/// assigning each field to the property named by the header column at the same position.
/// </summary>
/// <typeparam name="T">Type instantiated for every data record.</typeparam>
/// <param name="path">Path of the file to read.</param>
/// <param name="delimiter">Character separating the fields of a record.</param>
/// <param name="encoding">Encoding used to decode the file; UTF-8 when <c>null</c>.</param>
/// <returns>
/// A deferred sequence: the file is opened when enumeration starts and stays open until
/// the enumerator is disposed or the input is exhausted.
/// </returns>
public static IEnumerable<T> Read<T>(string path, char delimiter = ',', Encoding? encoding = null)
    where T : class, new()
{
    using StreamReader sr = new(path, encoding ?? Encoding.UTF8);
    string[] header = ReadHeader(sr, delimiter);
    List<string> values = new(ValueCountGuestimate);
    StringBuilder builder = new(LineLengthGuestimate);
    Setter<T>[] setters = [.. GetSetters<T>(header)];

    while (true)
    {
        // 'values' and 'builder' are reused across records to avoid reallocations.
        ParseNextRecord(sr, values, builder, delimiter);
        if (values.Count == 0)
        {
            // Nothing could be parsed: the input is exhausted.
            break;
        }

        if (values.Count == 1 && string.IsNullOrEmpty(values[0]))
        {
            // skip the blank lines
            continue;
        }

        T outobj = new();

        // A record may carry fewer fields than the header declares. Only the fields
        // that are present are assigned; the remaining properties keep their default
        // values instead of the loop indexing past the end of 'values'.
        int fieldCount = Math.Min(setters.Length, values.Count);
        for (int i = 0; i < fieldCount; i++)
        {
            setters[i].SetValue(outobj, values[i]);
        }

        yield return outobj;
    }
}
|
|
|
/// <summary>
/// Pairs a compiled property-assignment delegate with the type of the property it assigns.
/// </summary>
/// <param name="Action">Delegate that stores a value on a target instance.</param>
/// <param name="TargetType">Type the raw text has to be converted to before it is stored.</param>
record struct Setter<T>(Action<T, object> Action, Type TargetType)
{
    private static readonly Type s_stringType = typeof(string);
    private static readonly Type s_objectType = typeof(object);

    /// <summary>
    /// Stores <paramref name="value"/> on <paramref name="target"/>, converting it to
    /// <c>TargetType</c> first unless that type is <see cref="string"/> or <see cref="object"/>.
    /// </summary>
    internal readonly void SetValue(T target, string value)
    {
        // Text can be handed to string/object properties untouched.
        bool assignAsIs = TargetType == s_stringType || TargetType == s_objectType;

        object converted = assignAsIs
            ? value
            : LanguagePrimitives.ConvertTo(value, TargetType);

        Action(target, converted);
    }
}
|
|
|
/// <summary>
/// Builds, for every name in <paramref name="properties"/>, a compiled delegate that assigns
/// the matching public instance property of <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Type whose properties are looked up.</typeparam>
/// <param name="properties">Property names, matched case-insensitively.</param>
/// <returns>One <c>Setter</c> per name, in the same order; produced lazily.</returns>
/// <exception cref="ArgumentNullException">A name does not match any public instance property.</exception>
/// <exception cref="ArgumentException">The matching property is read-only or has no public setter.</exception>
private static IEnumerable<Setter<T>> GetSetters<T>(string[] properties)
{
    Type type = typeof(T);
    foreach (string property in properties)
    {
        // The single-string ArgumentNullException constructor interprets its argument as the
        // parameter NAME; use the (paramName, message) overload so the text is the message.
        PropertyInfo info = type.GetProperty(
            property,
            BindingFlags.IgnoreCase | BindingFlags.Instance | BindingFlags.Public)
            ?? throw new ArgumentNullException(
                nameof(properties),
                $"Property '{property}' not found on type '{type}'.");

        if (!info.CanWrite)
        {
            throw new ArgumentException($"Property '{property}' on '{type}' is read-only.");
        }

        // GetSetMethod() only returns public accessors, so a non-public setter ends up here.
        MethodInfo setter = info.GetSetMethod()
            ?? throw new ArgumentException($"Setter for '{property}' is inaccessible.");

        // Compiles: (T target, object value) => target.Property = (PropertyType)value
        ParameterExpression target = Expression.Parameter(type, "target");
        ParameterExpression value = Expression.Parameter(typeof(object), "value");
        MethodCallExpression body = Expression.Call(
            target,
            setter,
            Expression.Convert(value, info.PropertyType));

        yield return new(
            Expression.Lambda<Action<T, object>>(body, target, value).Compile(),
            info.PropertyType);
    }
}
|
|
|
/// <summary>
/// Consumes and returns the next character of the reader.
/// </summary>
/// <param name="sr">Reader to consume from.</param>
/// <returns>The character that was read.</returns>
/// <exception cref="InvalidOperationException">The reader is already at the end of the stream.</exception>
private static char ReadChar(StreamReader sr)
{
    if (!sr.EndOfStream)
    {
        return (char)sr.Read();
    }

    throw new InvalidOperationException("EOF is reached.");
}
|
|
|
/// <summary>
/// Checks, without consuming anything, whether the next character of the reader equals
/// <paramref name="c"/>.
/// </summary>
/// <param name="sr">Reader to peek into.</param>
/// <param name="c">Character the upcoming character is compared with.</param>
/// <returns>
/// <c>true</c> when a character is available and it equals <paramref name="c"/>;
/// <c>false</c> otherwise, including when nothing is left to peek.
/// </returns>
private static bool PeekNextChar(StreamReader sr, char c)
{
    int next = sr.Peek();

    // Peek() reports -1 when no character is available; that must never count as a match.
    return next != -1 && (char)next == c;
}
|
|
|
/// <summary>
/// Reads the header record from the start of the input and returns its column names.
/// </summary>
/// <remarks>
/// Records whose first field starts with <c>#</c> are skipped, except a record whose first
/// field starts with <c>#Fields: </c> (W3C Extended Log File Format): that record is the
/// header, with the prefix removed. Otherwise the first record that does not start with
/// <c>#</c> is the header. Trailing empty column names are dropped.
/// </remarks>
/// <param name="sr">Reader positioned at the start of the input.</param>
/// <param name="delimiter">Character separating the fields of a record.</param>
/// <returns>The header names; empty when the input holds no header record.</returns>
/// <exception cref="ArgumentOutOfRangeException">The header contains a duplicated name.</exception>
internal static string[] ReadHeader(StreamReader sr, char delimiter)
{
    List<string> values = new(ValueCountGuestimate);
    StringBuilder builder = new(LineLengthGuestimate);
    bool headerFound = false;

    while (!headerFound && !sr.EndOfStream)
    {
        ParseNextRecord(sr, values, builder, delimiter);

        // Drop trailing empty entries (they come from trailing delimiters/blanks).
        // A record consisting of a single empty entry is left untouched.
        while (values.Count > 1 && values[^1].Length == 0)
        {
            values.RemoveAt(values.Count - 1);
        }

        // Ordinal comparison: the slice below relies on the prefix being exactly 9 chars.
        if (values.Count != 0 && values[0].StartsWith("#Fields: ", StringComparison.Ordinal))
        {
            // W3C Extended Log File Format header line.
            values[0] = values[0][9..];
            headerFound = true;
        }
        else if (values.Count == 0 || !values[0].StartsWith('#'))
        {
            // Not a comment line: by default the first such record is the header.
            headerFound = true;
        }

        // Otherwise the record is a '#' comment line and is skipped.
    }

    if (!headerFound)
    {
        // The input ended while only comment lines had been seen. 'values' still holds the
        // last comment record, which must not be mistaken for a header.
        values.Clear();
    }

    ValidatePropertyNames(values);
    return [.. values];
}
|
|
|
/// <summary>
/// Verifies that no non-empty header name occurs more than once, ignoring case.
/// </summary>
/// <param name="names">Header names to check. <c>null</c> and empty lists are accepted as-is.</param>
/// <exception cref="ArgumentOutOfRangeException">A non-empty name is duplicated.</exception>
private static void ValidatePropertyNames(List<string> names)
{
    if (names is null || names.Count == 0)
    {
        // Nothing to validate. An empty header is not reported here.
        return;
    }

    HashSet<string> headers = new(StringComparer.OrdinalIgnoreCase);
    foreach (string currentHeader in names)
    {
        // Empty names are tolerated and never count as duplicates of each other.
        if (string.IsNullOrEmpty(currentHeader))
        {
            continue;
        }

        if (!headers.Add(currentHeader))
        {
            // Terminating error. The (paramName, message) overload is required here: the
            // single-string constructor would treat the text as the parameter name.
            throw new ArgumentOutOfRangeException(
                nameof(names),
                $"'{currentHeader}' is duplicated in header.");
        }
    }
}
|
|
|
/// <summary>
/// Parses the next record from the reader and stores its fields in <paramref name="result"/>.
/// </summary>
/// <remarks>
/// Both <paramref name="result"/> and <paramref name="current"/> are cleared on entry so that
/// callers can reuse them across records. When nothing is left to read, <paramref name="result"/>
/// is left empty.
/// </remarks>
/// <param name="sr">Reader to consume characters from.</param>
/// <param name="result">Receives the fields of the record.</param>
/// <param name="current">Scratch buffer used to accumulate the field being read.</param>
/// <param name="delimiter">Character separating the fields of a record.</param>
private static void ParseNextRecord(
    StreamReader sr,
    List<string> result,
    StringBuilder current,
    char delimiter)
{
    result.Clear();
    current.Clear();

    bool insideQuotes = false;
    bool recordComplete = false;

    // Moves the text accumulated so far into the result list as one field.
    void CommitField()
    {
        result.Add(current.ToString());
        current.Clear();
    }

    // Consumes everything up to the next delimiter or line end, commits the field and
    // reports whether the record ended there.
    bool CommitRestOfField(bool eatTrailingBlanks)
    {
        bool ended = false;
        ReadTillNextDelimiter(sr, current, ref ended, eatTrailingBlanks, delimiter);
        CommitField();
        return ended;
    }

    while (!recordComplete && !sr.EndOfStream)
    {
        char ch = ReadChar(sr);

        if (ch == delimiter)
        {
            if (insideQuotes)
            {
                // A delimiter between quotes belongs to the field:
                // "foo, bar" yields ->foo, bar<-
                current.Append(ch);
            }
            else
            {
                // A delimiter outside quotes terminates the field.
                CommitField();
            }
        }
        else if (ch == '"')
        {
            if (!insideQuotes)
            {
                if (current.Length == 0)
                {
                    // Quote at the very start of a field opens a quoted section.
                    insideQuotes = true;
                }
                else
                {
                    // Quote in the middle of an unquoted field. Strictly an error, but be
                    // lenient: keep the quote and read up to the next delimiter, so
                    // foo "ba,r" yields ->foo "ba<- for this field.
                    current.Append(ch);
                    recordComplete = CommitRestOfField(eatTrailingBlanks: false);
                }
            }
            else if (PeekNextChar(sr, '"'))
            {
                // A doubled quote inside quotes stands for one literal quote:
                // "foo""bar" yields ->foo"bar<-
                // PeekNextChar did not consume the second quote, so do it now.
                ReadChar(sr);
                current.Append('"');
            }
            else
            {
                // Closing quote. Whatever follows up to the next delimiter is read as well:
                // a remainder made only of blanks is dropped, anything else is appended
                // to the field text.
                insideQuotes = false;
                recordComplete = CommitRestOfField(eatTrailingBlanks: true);
            }
        }
        else if (ch is ' ' or '\t')
        {
            if (insideQuotes)
            {
                // Blanks between quotes are ordinary content.
                current.Append(ch);
            }
            else if (current.Length != 0)
            {
                // Blank after the start of an unquoted field. Be lenient: keep it and read
                // up to the next delimiter, e.g. ->foo bar<- stays ->foo bar<-.
                current.Append(ch);
                recordComplete = CommitRestOfField(eatTrailingBlanks: true);
            }

            // Otherwise this is a leading blank of a field and is ignored.
        }
        else if (IsNewLine(sr, ch, out string newLine))
        {
            if (insideQuotes)
            {
                // Line breaks between quotes are ordinary content.
                current.Append(newLine);
            }
            else
            {
                // A line break outside quotes ends both the field and the record.
                CommitField();
                recordComplete = true;
            }
        }
        else
        {
            current.Append(ch);
        }
    }

    // Input ended in the middle of a field: keep what was collected.
    if (current.Length != 0)
    {
        result.Add(current.ToString());
    }
}
|
|
|
/// <summary>
/// Determines whether <paramref name="ch"/> starts a line break and, if so, reports the
/// complete break sequence.
/// </summary>
/// <param name="sr">Reader that supplied <paramref name="ch"/>; a line feed directly after a
/// carriage return is consumed from it.</param>
/// <param name="ch">Character that was just read.</param>
/// <param name="newLine">"\r", "\n" or "\r\n" for a line break; an empty string otherwise.</param>
/// <returns><c>true</c> when a line break was detected.</returns>
private static bool IsNewLine(StreamReader sr, char ch, out string newLine)
{
    switch (ch)
    {
        case '\r':
            if (PeekNextChar(sr, '\n'))
            {
                // Swallow the line feed so that CR LF counts as a single break.
                ReadChar(sr);
                newLine = "\r\n";
            }
            else
            {
                newLine = "\r";
            }

            return true;

        case '\n':
            newLine = "\n";
            return true;

        default:
            newLine = string.Empty;
            return false;
    }
}
|
|
|
/// <summary>
/// Consumes characters up to and including the next delimiter or line break (or up to the end
/// of the input) and appends the consumed text, without the terminator, to
/// <paramref name="current"/>.
/// </summary>
/// <param name="sr">Reader to consume characters from.</param>
/// <param name="current">Buffer of the field being read; receives the consumed text.</param>
/// <param name="endOfRecord">
/// Set to <c>true</c> when reading stopped at a line break or at the end of the input.
/// Left unchanged when reading stopped at a delimiter.
/// </param>
/// <param name="eatTrailingBlanks">
/// When <c>true</c> and the consumed text holds nothing but spaces and tabs, that text is
/// trimmed away instead of being appended. Text containing any other character is always
/// appended unchanged.
/// </param>
/// <param name="delimiter">Character separating the fields of a record.</param>
private static void ReadTillNextDelimiter(
    StreamReader sr,
    StringBuilder current,
    ref bool endOfRecord,
    bool eatTrailingBlanks,
    char delimiter)
{
    StringBuilder pending = new();

    // Becomes true as soon as something other than a space or tab is consumed.
    bool sawNonBlank = false;

    for (;;)
    {
        if (sr.EndOfStream)
        {
            endOfRecord = true;
            break;
        }

        char next = ReadChar(sr);
        if (next == delimiter)
        {
            break;
        }

        if (IsNewLine(sr, next, out _))
        {
            endOfRecord = true;
            break;
        }

        pending.Append(next);
        sawNonBlank |= next is not (' ' or '\t');
    }

    if (eatTrailingBlanks && !sawNonBlank)
    {
        current.Append(pending.ToString().Trim());
    }
    else
    {
        current.Append(pending);
    }
}
|
} |