#!/bin/bash # program by Andreas Hadjiprocopis # livantes at soi.city.ac.uk # Institute of Cancer Research # London, 2010 # Free for everyone to copy, use, modify / quote author # based on R xlsx package by Adrian Dragulescu inputFile="" outputDir="" hash=0 colnames=",quote=FALSE" col_names_at_row="" row_range=() col_range_from="1" col_range_to="" separator="t" app_name="$0" if [ "$*" = "" ]; then display_help_exit $app_name fi app_name="$0" while getopts "i:o:h:r:ac:s:H" OPTION do case $OPTION in s) separator=$OPTARG ;; i) if [ "$inputFile" != "" ]; then echo "$app_name : you can specify only one input file" exit 1 fi inputFile="$OPTARG" ;; o) if [ "$outputDir" != "" ]; then echo "$app_name : you can specify only one output folder" exit 1 fi outputDir=$OPTARG ;; a) # add a hash mark comment to the first line (if you use perl/csh scripts) hash=1 ;; h) # quote only the first line (which is column names) if [ "$col_names_at_row" != "" ]; then echo "$app_name : you can use the -h option only once" exit 1 fi col_names_at_row=$OPTARG colnames=",quote=c(1),col.names=TRUE" ;; r) tmp1=`echo "$OPTARG" | cut -d':' -f1` tmp2=`echo "$OPTARG" | cut -d':' -f2` if [[ "$tmp1" = "" ]] && [[ "$tmp2" = "" ]]; then echo "$app_name : incorrect format for row range (use '-r from:to' where either 'from' or 'to' but not both can be empty meaning beginning for from and end for to)" exit 1 fi if [[ "$tmp1" = "" ]]; then tmp1="-1"; fi if [[ "$tmp2" = "" ]]; then tmp2="-1"; fi row_range+=("$tmp1" "$tmp2") ;; c) if [ "$col_range_to" != "" ]; then echo "$app_name : you can use the -c option only once" exit 1 fi col_range_from=`echo "$OPTARG" | cut -d':' -f1` col_range_to=`echo "$OPTARG" | cut -d':' -f2` if [["$col_range_to" = ""]] && [["$col_range_from" = ""]]; then echo "$app_name : incorrect format for column range (use '-c from:to')" exit 1 fi ;; H) display_help_exit "$app_name" ;; esac done if [ ! -f "$inputFile" ]; then echo "$0 : input file $inputFile does not exist." exit 1 fi if [ -e "$outputDir" ]; then if [ ! -d "$outputDir" ]; then echo "$0 : output directory $outputDir exists and is not a directory!" exit 1 fi else mkdir -p "$outputDir" fi R --vanilla << EOR require("xlsx") write_table_info <- function(message, rf, rt, cf, ct){ return (paste(message, " rows=", rf, ":", rt, ", cols=", cf, ":", ct, sep="")); } file = system.file("crap", "$inputFile", package="xlsx") wb = loadWorkbook("$inputFile") sheets = getSheets(wb) # we need to convert each string numeral into an integer using the as.numeric and that applies to columns/rows of matrix if( "${row_range[*]}" != "" ){ row_range = apply(matrix(do.call(rbind, strsplit("${row_range[*]}", " ")), ncol=2,byrow=TRUE), c(1,2), as.numeric); num_row_range = dim(row_range)[1]} else num_row_range = 0 if( "$col_range_to" == "" ){ col_range_to = -1 } else { col_range_to = $col_range_to+0 } col_range_from = $col_range_from for(sheet in sheets){ name = sheet\$getSheetName() rows = getRows(sheet) num_rows_in_sheet = 0 for(row in rows){ num_rows_in_sheet = num_rows_in_sheet + 1 } write_col_range_from = row\$getFirstCellNum() write_col_range_to = row\$getLastCellNum() if( "$col_names_at_row" != "" ){ col_names_at_row = $col_names_at_row } else { col_names_at_row = 0 } append = FALSE # check col range and if header, do it here outfile = paste("$outputDir", "/", name, ".txt", sep="") print(paste("$app_name : I am saving sheet:", name, " in '", outfile, "'", sep="")) values = NULL for(row in rows){ # row numbers start from 0 rn = row\$getRowNum() + 1 # this starts from 0 lc = row\$getLastCellNum(); if( col_range_to < 0 ) col_range_to = lc # this starts from 1 fc = row\$getFirstCellNum() + 1 if( col_names_at_row == rn ){ # if we have col names row, then this is is the definite num_cols guide if( fc >= col_range_from ) write_col_range_from = fc else write_col_range_from = col_range_from if( lc <= col_range_to ) write_col_range_to = lc else write_col_range_to = col_range_to if( "$hash" == "1" ){ print("$app_name : commenting the column names line (first) of output file"); write.table(c('#'), outfile, sep="", row.names=FALSE, eol="\t", col.names=FALSE, quote=FALSE) } print(write_table_info("$app_name : writing header at ", col_names_at_row, col_names_at_row, write_col_range_from, write_col_range_to)) tryCatch({values = getMatrixValues(sheet, col_names_at_row:col_names_at_row, write_col_range_from:write_col_range_to); write.table(values, outfile, sep="$separator", row.names=FALSE, eol="\n", col.names=FALSE, quote=TRUE)}, error=function(ex){write(paste("$app_name : error while writing header, row", rn, "of sheet", name, "of file $inputFile - can columns", write_col_range_from,":",write_col_range_to, "be empty?", ex), stderr())}) append = TRUE break } out_of_range = TRUE if( num_row_range > 0 ) for(i in 1:num_row_range) if( (((row_range[i,1]==-1)&&(rn>0))||((row_range[i,1]!=-1)&&(rn>=row_range[i,1]))) && (((row_range[i,2]==-1)&&(rn<=num_rows_in_sheet))||((row_range[i,2]!=-1)&&(rn<=row_range[i,2]))) ){ out_of_range = FALSE; break; } if( out_of_range == TRUE ) next if( write_col_range_from != fc ){ write(paste("Warning: in sheet",name,", row (",num_rows_in_sheet,") has different number of cells from row ",rn," (i.e. ",fc,"<>",write_col_range_from,")", sep=""), stderr()); if(write_col_range_from>fc) write_col_range_from = fc} if( write_col_range_to != lc ){ write(paste("Warning: in sheet",name,", last row (",num_rows_in_sheet,") has different number of cells from row ",rn," (i.e. ",lc,"<>",write_col_range_to,")", sep=""), stderr()); if(write_col_range_to>lc) write_col_range_to = lc} } for(row in rows){ # row numbers start from 0 rn = row\$getRowNum() + 1 if( col_names_at_row == rn ) next out_of_range = TRUE if( num_row_range > 0 ) for(i in 1:num_row_range) if( (((row_range[i,1]==-1)&&(rn>0))||((row_range[i,1]!=-1)&&(rn>=row_range[i,1]))) && (((row_range[i,2]==-1)&&(rn<=num_rows_in_sheet))||((row_range[i,2]!=-1)&&(rn<=row_range[i,2]))) ){ out_of_range = FALSE; break; } if( out_of_range == TRUE ) next tryCatch({values = getMatrixValues(sheet, rn:rn, write_col_range_from:write_col_range_to);write.table(values, outfile, sep="$separator", row.names=FALSE, eol="\n", col.names=FALSE, append=append)}, error=function(ex){write(paste("$app_name : error while writing row", rn, "of sheet", name, "of file $inputFile - - can columns", write_col_range_from,":",write_col_range_to, "be empty?\n", ex), stderr())}) append = TRUE } print(paste("$app_name : wrote ", rn, " rows", sep="")) } print("$app_name : done") EOR exit 0 function display_help_exit { echo "Usage : $1 -i excel_input_file -o outputDir [-r from_row:to_row] [-h row_with_column_names] [-a]" echo "This script will read an excel (xlsx) file and output each sheet onto" echo "a separate file in an output folder" echo " -i excel_input_file" echo " defines the excel input file." echo " -o outputDir" echo " Output folder name, all input's sheets will be saved in" echo " this folder. Hopefully, the sheet's name will not mess up" echo " with the file system (e.g question marks, stars etc" echo " -r from_row:to_row" echo " optional, specify a range of rows to save, rows start from 1." echo " if you leave from_row empty, then it is assumed to be 1" echo " if you leave to_row empty, then it is assumed to be the last non-empty row of the sheet" echo " More than one row ranges may be specified by repeated '-r' options" echo " e.g. -r 3:8 -r 10:30 -r 76:99 -r :12 -r 12:" echo " -c from_col:to_col" echo " optional, specify a range of columns to save, columns start from 1." echo " -h row_with_column_names" echo " optional, specify the row which holds the column names." echo " This row will be the first row in the output file and the contents of each" echo " cell (i.e the column name) will be quoted in double quotes" echo " -a" echo " optional, comment the first row if '-h row' option was specified" echo " if you will use the file in programs like gnuplot. R does not need this." echo " Example:" echo " -s" echo " optional, use this character(s)/string as separator between columns" echo "" echo " Example:" echo "$app_name -i 'a.xlsx' -o 'IOI' -h 4 -r 1:12 -r 20:40" echo "read from a.xlsx and write to output folder IOI (which will be created if it does not exist)" echo "The column names are in row 4" echo "The data rows to save are in columns 1 to 12 (and 4 will be excluded)" echo "because it is printed once as the first row of the output file" echo "So in essence, you will have in each output file (representing each sheet)" echo "in the xlsx file rows: 4, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11 and 12" echo "If row 4 (the column names) contains less columns that the other rows 1:12, then" echo "extra rows WILL BE IGNORED - e.g. if row 4 contains 3 columns and row 5 contains 6," echo "then when printing row 5, only columns 1, 2 and 3 will be printed." echo "In the above example the '-a' option will add a hash mark (#) at the beginning" echo "of the first row - the column names so you can use it with Gnuplot." echo "" echo "program by Andreas Hadjiprocopis" echo "livantes at soi.city.ac.uk" echo "Institute of Cancer Research" echo "London, 2010" echo "Free for everyone to copy, use, modify / quote author" echo "(based on R xlsx package by Adrian Dragulescu)" echo "" exit 1 }