// Spludlow Software // Copyright © Samuel P. Ludlow 2020 All Rights Reserved // Distributed under the terms of the GNU General Public License version 3 // Distributed WITHOUT ANY WARRANTY; without implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE // https://www.spludlow.co.uk/LICENCE.TXT // The Spludlow logo is a registered trademark of Samuel P. Ludlow and may not be used without permission // v1.14 using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; using System.IO; using System.Data; namespace Spludlow.Data { public class BasicCompanyData { // http://download.companieshouse.gov.uk/en_output.html private static Encoding encoding = Encoding.UTF8; /// /// Read table from part CSV file, ditch unwanted columns and fix datatypes based on mapping table /// public static DataTable CleanTable(string csvFilename, Encoding csvEncoding, DataTable columnMappingTable) { // Fix empty datatypes to String foreach (DataRow row in columnMappingTable.Rows) { if (row.IsNull("TargetName") == false && row.IsNull("DataType") == true) row["DataType"] = "String"; } // Create Result table DataTable resultTable = new DataTable(); foreach (DataRow row in columnMappingTable.Rows) { if (row.IsNull("TargetName") == true) continue; resultTable.Columns.Add((string)row["TargetName"], Type.GetType("System." + (string)row["DataType"], true)); } DataTable sourceTable = Spludlow.Data.CSV.Read(csvFilename, csvEncoding); // Copy rows to target table based on mapping table foreach (DataRow sourceRow in sourceTable.Rows) { DataRow targetRow = resultTable.NewRow(); foreach (DataRow mapRow in columnMappingTable.Rows) { if (mapRow.IsNull("TargetName") == true) continue; string dataType = (string)mapRow["DataType"]; string sourceName = (string)mapRow["SourceName"]; string targetName = (string)mapRow["TargetName"]; string textData = (string)sourceRow[sourceName]; object result = DBNull.Value; switch (dataType) { case "DateTime": if (textData.Length > 0) result = Convert.ToDateTime(textData); break; case "Int32": if (textData.Length > 0) result = Convert.ToInt32(textData); break; case "String": result = textData; break; default: throw new ApplicationException("CombineTables; Unsupported datatype: " + dataType); } targetRow[targetName] = result; } resultTable.Rows.Add(targetRow); } return resultTable; } /// /// Combine the 4 Normalized SIC codes into a single table /// public static void CombineSIC(string normalizationDirectory, string targetFilename) { int sicCount = 4; DataTable resultTable = new DataTable(); resultTable.Columns.Add("SICCode", typeof(int)); resultTable.Columns.Add("SICText", typeof(string)); resultTable.PrimaryKey = new DataColumn[] { resultTable.Columns[0] }; for (int sicIndex = 0; sicIndex < sicCount; ++sicIndex) { DataTable table = Spludlow.Data.TextTable.ReadFile(normalizationDirectory + @"\SicText" + (sicIndex + 1).ToString() + ".txt", encoding); foreach (DataRow sourceRow in table.Rows) { string sicText = (string)sourceRow[1]; if (sicText == "") continue; int sicCode = 0; if (sicText != "None Supplied") { int index = sicText.IndexOf("-"); if (index == -1) throw new ApplicationException("CombineSIC; Bad SIC Text: " + sicText); sicCode = Int32.Parse(sicText.Substring(0, index).Trim()); sicText = sicText.Substring(index + 1).Trim(); } DataRow resultRow = resultTable.Rows.Find(sicCode); if (resultRow == null) resultTable.Rows.Add(new object[] { sicCode, sicText }); } } DataView view = new DataView(resultTable); view.Sort = "SICCode"; Spludlow.Data.TextTable.Write(targetFilename, view, encoding); } /// /// Create Final lookup tables from nornalized tables /// public static void FinalNormalTables(string normalizationDirectory, string targetDirectory) { MakeFinalTable4CharPK(normalizationDirectory + @"\AccountCategory.txt", targetDirectory, "AccountCategories", "AccountCategoryId"); MakeFinalTable4CharPK(normalizationDirectory + @"\CompanyCategory.txt", targetDirectory, "CompanyCategories", "CompanyCategoryId"); MakeFinalTable4CharPK(normalizationDirectory + @"\CompanyStatus.txt", targetDirectory, "CompanyStatuses", "CompanyStatusId"); MakeFinalTableInt32PK(normalizationDirectory + @"\CountryOfOrigin.txt", targetDirectory, "CountryOfOrigins", "CountryOfOriginId"); } public static void MakeFinalTable4CharPK(string sourceFilename, string finalDirectory, string tableName, string idColumnName) { string filename = finalDirectory + @"\" + tableName + ".txt"; DataTable sourceTable = Spludlow.Data.TextTable.ReadFile(sourceFilename, encoding); DataTable targetTable = new DataTable(); targetTable.TableName = tableName; targetTable.Columns.Add(idColumnName, typeof(string)); targetTable.Columns.Add("Description", typeof(string)); targetTable.PrimaryKey = new DataColumn[] { targetTable.Columns[0] }; foreach (DataRow sourceRow in sourceTable.Rows) { string description = (string)sourceRow[1]; string[] parts = description.Split(new char[] { ' ', '/', '(', ')', '-', ',', '.', '\'' }, StringSplitOptions.RemoveEmptyEntries); string code; switch (parts.Length) { case 0: code = " "; break; case 1: code = description.Substring(0, 4); break; case 2: code = parts[0].Substring(0, 2) + parts[1].Substring(0, 2); break; case 3: code = parts[0].Substring(0, 2) + parts[1].Substring(0, 1) + parts[2].Substring(0, 1); break; default: code = parts[0].Substring(0, 1) + parts[1].Substring(0, 1) + parts[2].Substring(0, 1) + parts[parts.Length - 1].Substring(0, 1); break; } targetTable.Rows.Add(new object[] { code.ToUpper(), description }); } Spludlow.Data.TextTable.Write(filename, targetTable, encoding); } public static void MakeFinalTableInt32PK(string sourceFilename, string finalDirectory, string tableName, string idColumnName) { DataTable sourceTable = Spludlow.Data.TextTable.ReadFile(sourceFilename, encoding); DataTable targetTable = new DataTable(); targetTable.TableName = tableName; targetTable.Columns.Add(idColumnName, typeof(int)); targetTable.Columns.Add("Description", typeof(string)); targetTable.PrimaryKey = new DataColumn[] { targetTable.Columns[0] }; for (int index = 0; index < sourceTable.Rows.Count; ++index) { string description = (string)sourceTable.Rows[index][1]; targetTable.Rows.Add(new object[] { (index + 1), description }); } Spludlow.Data.TextTable.Write(finalDirectory + @"\" + tableName + ".txt", targetTable, encoding); } public static void MakeSICResolver(string[] sourceFilenames, string targetFilename) { DataTable resolverTable = Spludlow.Data.TextTable.ReadText(new string[] { "CompanyNumber SICCode Position", "String* Int32* Int32", }); resolverTable.TableName = "CompaniesSIC"; foreach (string sourceFilename in sourceFilenames) { DataTable table = Spludlow.Data.TextTable.ReadFile(sourceFilename, encoding); foreach (DataRow row in table.Rows) { string companyNumber = (string)row["CompanyNumber"]; for (int position = 1; position <= 4; ++position) { string sicText = (string)row["SicText" + position]; if (sicText.Length == 0) continue; int sicCode = 0; if (sicText != "None Supplied") { int index = sicText.IndexOf("-"); if (index == -1) throw new ApplicationException("CombineSIC; Bad SIC Text: " + sicText); sicCode = Int32.Parse(sicText.Substring(0, index).Trim()); } resolverTable.Rows.Add(companyNumber, sicCode, position); } } } Spludlow.Data.TextTable.Write(targetFilename, resolverTable, encoding); } public static void MakeCompaniesTableParts(string[] sourceDataFilenames, string sourceDirectory, string targetDirectory, DataTable mappingTable, string schemaDirectory) { DataSet schema = Spludlow.Data.Schemas.ReadDirectory(schemaDirectory); Dictionary> reverseLookups = new Dictionary>(); foreach (DataRow mapRow in mappingTable.Rows) { string tableName = (string)mapRow["TableName"]; string keyName = (string)mapRow["KeyName"]; DataTable table = Spludlow.Data.TextTable.ReadFile(sourceDirectory + @"\" + tableName + ".txt", encoding); Dictionary reverseLookup = new Dictionary(StringComparer.OrdinalIgnoreCase); foreach (DataRow row in table.Rows) reverseLookup.Add((string)row[1], row[0]); reverseLookups.Add(keyName, reverseLookup); } foreach (string sourceFilename in sourceDataFilenames) { string targetFilename = targetDirectory + @"\" + Path.GetFileName(sourceFilename); DataTable sourceTable = Spludlow.Data.TextTable.ReadFile(sourceFilename, encoding); DataTable targetTable = Spludlow.Data.ADO.TableCreate("Companies", schema); foreach (DataRow sourceRow in sourceTable.Rows) { DataRow targetRow = targetTable.NewRow(); foreach (DataColumn column in targetTable.Columns) { DataRow mapRow = mappingTable.Rows.Find(column.ColumnName); if (mapRow == null) { targetRow[column] = sourceRow[column.ColumnName]; } else { string keyName = (string)mapRow["KeyName"]; string sourceColumnName = (string)mapRow["SourceColumn"]; string sourceValue = (string)sourceRow[sourceColumnName]; targetRow[column] = reverseLookups[keyName][sourceValue]; } } targetTable.Rows.Add(targetRow); } Spludlow.Data.TextTable.Write(targetFilename, targetTable, encoding); } } public static void ImportDatabase(string sourceDirectory, string companySourceDirectory, string schemaDirectory, string connectionString, string databaseName, string tempDirectory) { Spludlow.Data.IDAL database = Spludlow.Data.DAL.Create(connectionString); if (database.DatabaseExists(databaseName) == true) database.DatabaseDelete(databaseName); database.DatabaseCreate(databaseName); database.ChangeDatabase(databaseName); DataSet schema = Spludlow.Data.Schemas.ReadDirectory(schemaDirectory); string[] tableNames = Spludlow.Data.Schemas.TableNamesInOrder(schema); foreach (string tableName in tableNames) database.TableCreate(tableName, schema); database.TableAddKeys(schema, "UI"); foreach (string tableName in tableNames) { if (tableName == "Companies") continue; string dataFilename = sourceDirectory + @"\" + tableName + ".txt"; DataTable table = Spludlow.Data.TextTable.ReadFile(dataFilename, encoding); database.BulkInsert(table, tableName, tempDirectory); Spludlow.Log.Info("BasicCompanyData, ImportDatabase; Done table: " + tableName + ", rows: " + table.Rows.Count); } string[] companyFilenames = Directory.GetFiles(companySourceDirectory, "*.txt"); foreach (string companyFilename in companyFilenames) { DataTable table = Spludlow.Data.TextTable.ReadFile(companyFilename, encoding); database.BulkInsert(table, "Companies", tempDirectory); Spludlow.Log.Info("BasicCompanyData, ImportDatabase; Done company table: " + companyFilename + ", rows: " + table.Rows.Count); } database.CommandTimeout(240); database.TableAddKeys(schema, "F"); database.Operation("shrink log"); } // Create mapping file seperate !!!! // string[] lines = System.IO.File.ReadAllLines(@"C:\BasicCompanyData\ColumnNames.txt"); // lines = Spludlow.Text.Swivel(lines, false); //File.WriteAllLines(@"C:\BasicCompanyData\ColumnMapping.txt", lines); public static void CreateFullCleanSchema(string cleanDirectory, string combinedCompaniesFilename, string schemaDirectory) // Needs plenty of Memory { if (File.Exists(combinedCompaniesFilename) == true) File.Delete(combinedCompaniesFilename); string[] sourceFilenames = Directory.GetFiles(cleanDirectory, "*.txt"); if (sourceFilenames.Length != 5) throw new ApplicationException("Wrong number of TXT files than expected in directory: " + cleanDirectory); Spludlow.Data.TextTable.CombineTables(sourceFilenames, combinedCompaniesFilename, encoding); DataTable table = Spludlow.Data.TextTable.ReadFile(combinedCompaniesFilename, encoding); table.TableName = "Companies"; table.PrimaryKey = new DataColumn[] { table.Columns["CompanyNumber"] }; DataSet schema = Spludlow.Data.ADO.Schema(table, true); Spludlow.Data.TextTable.WriteDirectory(schemaDirectory, schema, encoding); } public static void MakeNormalTablesSchema(string finalDirectory, string normalSchemaDirectory) { string[] dataFilenames = Directory.GetFiles(finalDirectory, "*.txt"); DataSet dataSet = new DataSet(); foreach (string dataFilename in dataFilenames) { string tableName = Path.GetFileNameWithoutExtension(dataFilename); if (tableName == "Companies") continue; DataTable table = Spludlow.Data.TextTable.ReadFile(dataFilename, encoding); table.TableName = tableName; dataSet.Tables.Add(table); } DataSet schema = Spludlow.Data.ADO.Schema(dataSet, true); Spludlow.Data.TextTable.WriteDirectory(normalSchemaDirectory, schema); } public static void CombineFinalCompaniesTable(string finalPartsDirectory, string finalFilename) { if (File.Exists(finalFilename) == true) return; string[] sourceFilenames = Directory.GetFiles(finalPartsDirectory, "*.txt"); if (sourceFilenames.Length != 5) throw new ApplicationException("Wrong number of TXT files than expected in directory: " + finalPartsDirectory); Spludlow.Data.TextTable.CombineTables(sourceFilenames, finalFilename, encoding); } } }