[MKDoc-commit] [mkd-import] script to validate CSV databases before import

bruno at mkdoc.demon.co.uk bruno at mkdoc.demon.co.uk
Mon Jul 25 17:38:28 BST 2005


Log Message:
-----------
[mkd-import] script to validate CSV databases before import

Added Files:
-----------
    mkd-import/examples/ukaf:
        ukaf-checker.pl

-------------- next part --------------
--- /dev/null
+++ examples/ukaf/ukaf-checker.pl
@@ -0,0 +1,76 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use File::Temp qw/ :POSIX /;
+
+# Path to the 'data' directory: first command-line argument, or the bundled
+# example data.  Trailing slashes are dropped before building the file names.
+my $datadir = shift || 'examples/ukaf/data';
+   $datadir =~ s{/+$}{};
+
+# The exported database is three CSV files; all of them are required.
+my $file = { agency => "$datadir/Agency.txt",
+               area => "$datadir/Area.txt",
+              codes => "$datadir/CODES.txt" };
+
+print STDERR "\nLooking for CSV database in '$datadir'...\n";
+
+# Open all three tables before reading any, so a missing file is reported
+# first.  Lexical filehandles replace the global barewords AGENCY/AREA/CODES;
+# each table is then slurped as a list of lines decoded as UTF-8.
+open my $agency_fh, "<:encoding(utf8)", $file->{agency} or die "Can't find $file->{agency}: $!";
+open my $area_fh,   "<:encoding(utf8)", $file->{area}   or die "Can't find $file->{area}: $!";
+open my $codes_fh,  "<:encoding(utf8)", $file->{codes}  or die "Can't find $file->{codes}: $!";
+my @agency = <$agency_fh>; close $agency_fh;
+my @area   = <$area_fh>;   close $area_fh;
+my @codes  = <$codes_fh>;  close $codes_fh;
+
+print STDERR "CSV files found OK.\n\n";
+
+# Check that each table starts with the expected CSV header row.  The header
+# is matched as a literal substring (\Q...\E), and an empty file now fails the
+# check directly instead of first warning about an uninitialized first line.
+
+print STDERR "Looking for required database fields...\n";
+
+my $agency_headers = '"RecordId","Status","OrgName","Address1","Address2","Address3","Address4","ConfidentialAddress","PostCode","PublicPhone","AdminPhone","Minicom","Email","Website","Monday","Tuesday","Wednesday","Thursday","Friday","Weekends","OfficeHours","TargetGroup","AreaServed","ServiceOffered","HowtoContact","Languages","TypeofOrganisation","Fax","WheelchairAccess","AdaptedToilets","AccessText","PublicTransport","YearEstablished","Staffing","CharityNo","LocalAuthority","LastUpdated"';
+my $area_headers   = '"RecordId","Code"';
+my $codes_headers  = '"CombinedCode","Code","Description","OrderID"';
+
+defined($agency[0]) && $agency[0] =~ /\Q$agency_headers\E/ or die "unknown database fields in $file->{agency}";
+print STDERR "'Agency' fields OK.\n";
+defined($area[0])   && $area[0]   =~ /\Q$area_headers\E/   or die "unknown database fields in $file->{area}";
+print STDERR "'Area' fields OK.\n";
+defined($codes[0])  && $codes[0]  =~ /\Q$codes_headers\E/  or die "unknown database fields in $file->{codes}";
+print STDERR "'CODES' fields OK.\n\n";
+
+# Check that the Agency table is UTF-8, converting it from CP1252 if not.
+# The file's raw bytes are strictly decoded instead of searching the text for
+# a pound sign, so valid UTF-8 without that character is never re-encoded.
+print STDERR "Checking that data is Unicode UTF-8...\n";
+
+require Encode;
+open my $raw_fh, '<:raw', $file->{agency} or die "Can't find $file->{agency}: $!";
+my $bytes = do { local $/; <$raw_fh> };
+close $raw_fh;
+# FB_CROAK makes undecodable input fatal; LEAVE_SRC keeps $bytes intact for reuse.
+my $strict  = Encode::FB_CROAK() | Encode::LEAVE_SRC();
+my $utf8_ok = eval { Encode::decode('UTF-8', $bytes, $strict); 1 } ? "TRUE" : "FALSE";
+
+if ($utf8_ok eq "TRUE")
+{
+    print STDERR "'Agency' table contains UTF-8 characters, OK.\n\n";
+}
+else
+{
+    print STDERR "'Agency' table is not UTF-8.  Converting from CP1252...\n";
+    # Convert fully in memory first: a failure dies before the file is touched.
+    my $converted = Encode::encode('UTF-8', Encode::decode('cp1252', $bytes, $strict));
+    open my $out_fh, '>:raw', $file->{agency} or die "Can't write $file->{agency}: $!";
+    print {$out_fh} $converted or die "Can't write $file->{agency}: $!";
+    close $out_fh or die "Can't write $file->{agency}: $!";
+    print STDERR "'Agency' table converted, OK.\n\n";
+}
+
+1;
+


More information about the MKDoc-commit mailing list