[MKDoc-commit] [mkd-import] script to validate CSV databases before import
bruno at mkdoc.demon.co.uk
bruno at mkdoc.demon.co.uk
Mon Jul 25 17:38:28 BST 2005
Log Message:
-----------
[mkd-import] script to validate CSV databases before import
Added Files:
-----------
mkd-import/examples/ukaf:
ukaf-checker.pl
-------------- next part --------------
--- /dev/null
+++ examples/ukaf/ukaf-checker.pl
@@ -0,0 +1,124 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use Encode qw/ decode /;
+use File::Copy qw/ copy /;
+use File::Temp qw/ :POSIX /;
+
+# Validate an exported UKAF CSV database before it is imported.
+#
+# Usage: ukaf-checker.pl [DATADIR]
+#
+# DATADIR defaults to 'examples/ukaf/data' and must contain Agency.txt,
+# Area.txt and CODES.txt.  The script dies on the first file that cannot be
+# opened or whose header row is not the expected one.  If Agency.txt is not
+# valid UTF-8 it is converted in place from CP1252 with iconv.  Progress is
+# reported on STDERR.
+
+# Return the whole of $path as undecoded bytes; dies if it cannot be opened.
+sub read_bytes
+{
+    my ($path) = @_;
+    open my $fh, '<:raw', $path or die "Can't find $path: $!";
+    local $/;    # slurp mode
+    my $bytes = <$fh>;
+    close $fh;
+    return defined $bytes ? $bytes : '';
+}
+
+# Return the first line of $bytes without its newline.  Empty input gives an
+# empty string, so the header checks never see undef.
+sub first_line
+{
+    my ($bytes) = @_;
+    my ($line) = $bytes =~ /\A([^\n]*)/;
+    return $line;
+}
+
+# take the path to the 'data' directory from the command line
+
+my $datadir = shift || 'examples/ukaf/data';
+   $datadir =~ s/\/+$//;
+
+my $file = { agency => "$datadir/Agency.txt",
+             area   => "$datadir/Area.txt",
+             codes  => "$datadir/CODES.txt" };
+
+print STDERR "\nLooking for CSV database in '$datadir'...\n";
+
+# check for existence of exported database. Three files are required called
+# Agency.txt, Area.txt and CODES.txt.
+#
+# The files are read as raw bytes: the header rows are plain ASCII, and the
+# encoding of the Agency data is what still has to be established below.
+
+my $agency = read_bytes($file->{agency});
+my $area   = read_bytes($file->{area});
+my $codes  = read_bytes($file->{codes});
+
+print STDERR "CSV files found OK.\n\n";
+
+# check that CSV headers are in place and correct
+
+print STDERR "Looking for required database fields...\n";
+
+my $agency_headers = '"RecordId","Status","OrgName","Address1","Address2","Address3","Address4","ConfidentialAddress","PostCode","PublicPhone","AdminPhone","Minicom","Email","Website","Monday","Tuesday","Wednesday","Thursday","Friday","Weekends","OfficeHours","TargetGroup","AreaServed","ServiceOffered","HowtoContact","Languages","TypeofOrganisation","Fax","WheelchairAccess","AdaptedToilets","AccessText","PublicTransport","YearEstablished","Staffing","CharityNo","LocalAuthority","LastUpdated"';
+my $area_headers = '"RecordId","Code"';
+my $codes_headers = '"CombinedCode","Code","Description","OrderID"';
+
+# \Q...\E makes each expected header match as a literal string, not a pattern
+
+first_line($agency) =~ /\Q$agency_headers\E/ or die "unknown database fields in $file->{agency}";
+print STDERR "'Agency' fields OK.\n";
+
+first_line($area) =~ /\Q$area_headers\E/ or die "unknown database fields in $file->{area}";
+print STDERR "'Area' fields OK.\n";
+
+first_line($codes) =~ /\Q$codes_headers\E/ or die "unknown database fields in $file->{codes}";
+print STDERR "'CODES' fields OK.\n\n";
+
+# check that files are UTF-8 not CP1252
+#
+# A strict decode succeeds only if every byte sequence is well-formed UTF-8
+# (plain ASCII included), so the verdict does not depend on some particular
+# character, such as a pound sign, happening to occur in the data.
+
+print STDERR "Checking that data is Unicode UTF-8...\n";
+
+my $utf8_ok = eval
+{
+    my $copy = $agency;    # decode() with a CHECK argument may modify its input
+    decode('UTF-8', $copy, Encode::FB_CROAK());
+    1;
+};
+
+if ($utf8_ok)
+{
+    print STDERR "'Agency' table contains UTF-8 characters, OK.\n\n";
+}
+
+# convert from CP1252 to UTF-8 if necessary
+
+if (not $utf8_ok)
+{
+    print STDERR "'Agency' table is not UTF-8. Converting from CP1252...\n";
+
+    # File::Temp->new creates the temporary file safely and removes it again
+    # when $temp goes out of scope.
+    my $temp     = File::Temp->new;
+    my $tempname = $temp->filename;
+
+    # List-form system() bypasses the shell, so a path containing spaces or
+    # shell metacharacters reaches iconv unchanged.
+    system('iconv', '-f', 'CP1252', '-t', 'utf-8', '-o', $tempname, $file->{agency}) == 0
+        or die "iconv failed on $file->{agency} (status $?)";
+
+    # Overwrite the original only once the conversion is known to have worked.
+    copy($tempname, $file->{agency})
+        or die "Can't write converted data to $file->{agency}: $!";
+
+    print STDERR "'Agency' table converted, OK.\n\n";
+}
+
+1;
+
More information about the MKDoc-commit
mailing list