#!/bin/bash
#<pre><b>
# Script     : pstolp
# Version    : 1.0
# Author     : Roland Rashleigh-Berry
# Date       : 02-Aug-2005
# Purpose    : Production script to convert a postscript document to plain text
# SubScripts : none
# Notes      : This does the opposite of "lptops" hence its name is made up of
#              the parts of "lptops" going backwards. YOU SHOULD ONLY USE THIS
#              SCRIPT ON POSTSCRIPT FILES CREATED BY "LPTOPS". This script is 
#              used to ensure no corruption of characters was caused by "lptops"
#              processing by comparing the text stripped out of the postscript
#              file with the original text that went in.
#
#              This code was written for gawk version 3.1.4. Earlier 3.0.x
#              versions of gawk might interpret the escape sequences in the
#              gsub() calls differently and so cause a syntax error. This can
#              be fixed by changing the number of escape characters until it
#              works correctly for your version of gawk.
#
# Usage      : pstolp big.ps > big.txt
#              
#===============================================================================
# PARAMETERS:
#-pos- -------------------------------description-------------------------------
#  1   "lptops" postscript file to convert back to text
#===============================================================================
# AMENDMENT HISTORY:
# init --date-- mod-id ----------------------description------------------------
# 
#===============================================================================
# This is public domain software. No guarantee as to suitability or accuracy is
# given or implied. User uses this code entirely at their own risk.
#===============================================================================

# Exactly one postscript file must be named on the command line. Anything
# else gets the usage text on stderr and a non-zero exit status. The usage
# text names this script (pstolp) so that it matches the header above.
if [[ $# -ne 1 ]] ; then
  printf '%s\n' "Usage: pstolp file.ps > file.txt" 1>&2
  exit 1
fi


gawk '

#######################  DEFINE FUNCTIONS  #######################

##### function to convert an octal string to a number #####
# The first character of str is skipped (the caller hands over the leading
# backslash as well as the digits). Every following character is folded in
# as one base-8 digit, and a character that is not an octal digit counts
# as zero. Returns the resulting number.
function oct2dec(str,        total, len, pos, digit)
{
    total = 0
    len = length(str)
    pos = 2
    while (pos <= len) {
        # index() is 1-based and gives 0 when the character is not found
        digit = index("01234567", substr(str, pos, 1)) - 1
        if (digit < 0)
            digit = 0
        total = total * 8 + digit
        pos++
    }
    return total
}


##### function to convert all octal strings to character #####
# Replaces each octal escape in str (a backslash followed by three octal
# digits, where that backslash is not itself escaped) by the character
# with that code, and returns the result. A string that holds no such
# escape is returned unchanged.
function convoctal(str,        newstr,num,octstr)
{
  newstr=str

  # The pattern wants an odd run of backslashes in front of the digits so
  # that an escaped backslash followed by digits is left alone.
  while (match(newstr,/(^|[^\\])\\(\\\\)*[0-7][0-7][0-7]/))
    {
     # The last four matched characters are the backslash and its digits
     octstr=substr(newstr,RSTART+RLENGTH-4,4)
     num=oct2dec(octstr)

     # Head, decoded character and tail have to be joined in ONE statement.
     # awk ends a statement at a newline, so breaking this concatenation
     # over two lines throws away everything after the escape.
     newstr=substr(newstr,1,RSTART+RLENGTH-5) sprintf("%c",num) substr(newstr,RSTART+RLENGTH)
    }

  return newstr
}


###### function to fix a completed line #####
# Decodes the octal escapes in str, then strips the escaping backslash
# from escaped brackets and from escaped backslashes. Returns the fixed
# text.
function fixline(str,         fixed)
{
  fixed=convoctal(str)

  # Note that the number of escape characters
  # required below varies with versions of gawk
  gsub("\\\\\\(","(",fixed)      # escaped open bracket
  gsub("\\\\\\)",")",fixed)      # escaped close bracket
  gsub("\\\\\\\\","\\",fixed)    # escaped backslash, done last

  return fixed
}


#####################  INITIALISATION BLOCK  #####################
BEGIN {
        # No pages have been seen yet
        pagecount = 0

        # Start with every holding variable for split-line pieces empty
        str1 = str2 = str3 = str4 = str5 = str6 = str7 = ""
}


####################  MAIN BLOCK STARTS HERE  ####################
# Runs for every input line, but only a line starting with "%%Page:" has
# any effect. The rest of that page is then read with getline inside this
# rule: lines are skipped up to "FN", and the lines from there to "CE"
# are turned back into text lines on standard output.
#
# A text line that was split over several input lines is gathered in
# linebuf, so every piece is kept however many pieces there are.
# If the input ends in the middle of a page, any gathered text is written
# out, a warning goes to stderr and the exit status is 1.
{

# We are near the start of a page when the line starts with "%%Page:"
if ($0 ~ /^%%Page:/)
  {
   # Increment page counter
   pagecount++

   # Keep reading lines until page start is indicated with "FN".
   # getline gives 0 at end of file and -1 on error and leaves $0 as it
   # was, so its result is tested to stop a short file looping forever.
   got=1
   while (got > 0 && $0 !~ /^FN$/)
     got=(getline)

   # Throw away the "FN" line so we are truly at the start of the page
   if (got > 0)
     got=(getline)

   if (got > 0)
     {
      # Print a page throw for all pages after the first
      if (pagecount>1)
        printf("%c",12)

      # Keep doing until the page is finished with "CE"
      while (got > 0 && $0 !~ /^CE$/)
        {
         # End of line indicated by ")P " at end or short line so write output line
         if (substr($0,length($0)-2,3)==")P " || length($0) < 4)
           {
            if (length(linebuf) > 0)
              {
               # Last piece of a split line: only the ")P " is removed
               if (length($0) > 3)
                 strend=substr($0,1,length($0)-3)
               else
                 strend=""
              }
            else
              {
               # Whole line in one piece: remove the first character
               # and the ")P " at the end
               if (length($0) > 4)
                 strend=substr($0,2,length($0)-4)
               else
                 strend=""
              }

            # Bring the pieces of the line together
            fulline=linebuf strend

            # Print the fixed line out or null if nothing
            if (length(fulline) == 0)
              print ""
            else
              print fixline(fulline)

            # line now printed so start gathering afresh
            linebuf=""
           }  #--- End of if line ends with ")P "
         else
           {
            if (length(linebuf)==0)
              # First piece: drop the first and the last character
              linebuf=substr($0,2,length($0)-2)
            else
              # Later piece: drop the last character only
              linebuf=linebuf substr($0,1,length($0)-1)
           }

         got=(getline)

        }  #--- End of while not "CE"
     }

   # Input ran out before the page was complete
   if (got <= 0)
     {
      if (length(linebuf) > 0)
        {
         print fixline(linebuf)
         linebuf=""
        }
      print "pstolp: unexpected end of input in page " pagecount > "/dev/stderr"
      exit 1
     }
  }  #--- End of if it starts with "%%Page:"

}' "$1"
