# --------------------------------------------------------------- # Program 'cleancsv' # # This program will read comma-separated text files exported from # Microsoft Excel, and produce corresponding text files that are # more easy to read by Fortran programs. # # Input files have file extension *.CSV. Output files have # extension *.TXT and are written into the same directory as the # corresponding input files. # # A list of input files to process is read from a master data # file named 'cleancsv.txt'. This is a text file that contains # full or partial path names of input files, one on each line. # # Author and contact: # # Petr Kuzmic, Ph.D. # BioKin Ltd. - Software & Consulting # # pkuzmic@biokin.com # http://www.biokin.com # # 04/02/2002 added the line: $csv::input_file_extension = "CSV"; # to force the script to read capital .CSV extensions as well # as lower case - Tammy W. Beaty, taw@ornl.gov # --------------------------------------------------------------- use strict; package csv; # ---------------------- # Define some constants. # ---------------------- $csv::input_file_list = "cleancsv.txt"; $csv::input_file_extension = "csv"; $csv::input_file_extension = "CSV"; $csv::output_file_extension = "txt"; $csv::empty_cell = " "; # change to "" if desired # --------------------------------- # Get the list of files to process. # --------------------------------- open (LIST, $csv::input_file_list) or die "Could not open file $csv::input_file_list!\n"; @csv::list = ; close (LIST); # ------------------ # Process the files. # ------------------ foreach $csv::input_file (@csv::list) { chomp ($csv::input_file); process_file ($csv::input_file); } # --------- # All done. # --------- print "\n\nAll done. Press Enter key to exit program."; ; exit (); # --------------------------------------------------------------- # Process a single input file. # --------------------------------------------------------------- sub process_file { # ------------------------ # Get the input file name. # ------------------------ my ($input_file) = @_; my ($status, $output_file); my (@input, $line, $number_of_lines, @fields, $field, $length, $counter); # --------------------- # Try to open the file. # --------------------- $status = open (IN, $input_file); if ($status != 1) { print "ERROR: The file $input_file could not be found!\n"; return; } print "processing $input_file...\n"; # ------------------------ # Create output file name. # ------------------------ $input_file =~ m|(.+)$csv::input_file_extension$|; $output_file = "$1$csv::output_file_extension"; # --------------------------------- # Open the output file for writing. # --------------------------------- $status = open (OUT, ">$output_file"); if ($status != 1) { print "ERROR: The file $output_file could not be opened!\n"; return; } # ---------------------------------- # Process input file line by line. # ---------------------------------- $number_of_lines = 0; foreach $line () { chomp ($line); $number_of_lines++; @fields = split_line ($line); mark_fields_as_quoted (\@fields); $length = @fields; $counter = 0; foreach $field (@fields) { if ($field eq "") { $field = $csv::empty_cell; } if ($csv::quoted[$counter]) { print OUT "\"$field\""; } else { print OUT $field; } if ($counter++ == $length-1) { print OUT "\n"; } else { print OUT ","; } } } close (OUT); print "written $number_of_lines lines.\n\n"; } # --------------------------------------------------------------- # Split a comma-separated text line into fields. # Some fields may contain commas, in which case they are enclosed # in double quotes ("). # --------------------------------------------------------------- sub split_line { my ($line) = @_; my (@fields, $inside_quoted_field, $field, $length, $i, $char); undef @fields; $inside_quoted_field = 0; $field = ""; $length = length ($line); for ($i = 0; $i < $length; $i++) { $char = substr ($line, $i, 1); if ($char eq '"') { if (! $inside_quoted_field) { $inside_quoted_field = 1; } else { $inside_quoted_field = 0; } } elsif (($char eq ',') && ! $inside_quoted_field) { push @fields, $field; $field = ""; } else { $field .= $char; } } push @fields, $field; return @fields } # --------------------------------------------------------------- # Mark the fields that should be enclosed in double quotes. # --------------------------------------------------------------- sub mark_fields_as_quoted { my ($input) = @_; my (@fields, $number_of_fields, $numerical_value, $i); @fields = @$input; $number_of_fields = @fields; undef @csv::quoted; # -------------------------------- # First mark all fields as quoted. # -------------------------------- for ($i = 0; $i < $number_of_fields; $i++) { $csv::quoted[$i] = 1; } # -------------------------------------------------- # Now un-mark all numerical fields. # -------------------------------------------------- for ($i = 0; $i < $number_of_fields; $i++) { if (is_a_number ($fields[$i])) { $csv::quoted[$i] = 0; } } } # --------------------------------------------------------------- # Determine if a given text represents a number. # --------------------------------------------------------------- sub is_a_number { my ($input) = @_; my ($is_a_number); # -------------------------------------- # Assume that the input is not a number. # -------------------------------------- $is_a_number = 0; # -------------------------------------------- # Now check if the input actually is a number. # -------------------------------------------- if ($input =~ m|^\-?\d+$|) { # ---------------------------------------------------- # The input matches an integer. # Example: "1", "23", "123", "-1", "-23", "-123", etc. # ---------------------------------------------------- $is_a_number = 1; } elsif ($input =~ m|^\-?\d+\.\d+$|) { # -------------------------------------------------------------- # The input matches a real number in the simple notation # Example: "1.12", "23.4", "-1.12", "-23.4", etc. # Note: ".123", meaning 0.123, is NOT accepted by this filter. # A number must not begin with a period, or a negative sign # followed by a period. # -------------------------------------------------------------- $is_a_number = 1; } elsif ($input =~ m|^\-?\d+\.\d+[eE]\-?\d+$|) { # -------------------------------------------------------------- # The input matches a real number in the "scientific" notation # Example: "1.12e02", "23.4E001", "-1.12e-1", "-23.4E-001", etc. # Note: ".123e04", meaning 123.0, is NOT accepted by this filter. # A number must not begin with a period, or a negative sign # followed by a period. # -------------------------------------------------------------- $is_a_number = 1; } return $is_a_number; }