/[webpac]/openisis/current/any2txt
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /openisis/current/any2txt

Parent Directory | Revision Log


Revision 237 - (hide annotations)
Mon Mar 8 17:43:12 2004 UTC (20 years ago) by dpavlin
File size: 3425 byte(s)
initial import of openisis 0.9.0 vendor drop

1 dpavlin 237 #!/bin/bash
2     #/*
3     # openisis - an open implementation of the CDS/ISIS database
4     # Version 0.8.x (patchlevel see file Version)
5     # Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org
6     #
7     # This library is free software; you can redistribute it and/or
8     # modify it under the terms of the GNU Lesser General Public
9     # License as published by the Free Software Foundation; either
10     # version 2.1 of the License, or (at your option) any later version.
11     #
12     # This library is distributed in the hope that it will be useful,
13     # but WITHOUT ANY WARRANTY; without even the implied warranty of
14     # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15     # Lesser General Public License for more details.
16     #
17     # You should have received a copy of the GNU Lesser General Public
18     # License along with this library; if not, write to the Free Software
19     # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20     #
21     # see README for more information
22     #EOH */
23     #
24     #
25     CVSID='$Id: any2txt,v 1.4 2003/04/08 00:20:52 kripke Exp $'
26     #
27     # utility to convert several document types to text
28     # used for mail archiving
29     # similar to anytopnm
30     #
31    
32     # assumptions:
33     # filename is a name of the file in the current directory
34     # we are allowed to use any filename.*
35     #
36    
37     # return code:
38     # the lowest valued bit denotes that there is a plain version in filename.txt
39     #
40     # 0 file was fine text and is unmodified
41     # 1 file was sort of (non-iso,long-lined) text, plain version is filename.txt
42     # 2 file was formatted html text
43     # 3 file was formatted html text, plain version is filename.txt
44     # 4,5 likewise for pdf
45     # 6,7 likewise for rtf
46     # 8,9 likewise for doc (M$-w0rd)
47     # 16,17 file was mail text
48     # 32 gif image
49     # 34 jpg image
50     # 36 bmp image
51     # 38 png image
52     # 40 tiff image
53     # 46 other image
54     # ?? other
55     # 124 unknown
56     # 126 error
57     #
58    
# Verbosity flag. The -v option sets it to "true"; the value is later run as
# a command (true/false) to decide whether the result summary is printed.
verbose=false

# err MESSAGE...
# Print all arguments, joined by single spaces, as one line on stderr and
# terminate the script with status 126 (the "error" return code).
err () {
  # printf rather than echo: a message consisting of "-n", "-e" or "-E"
  # would be taken as an echo option and never be shown.
  printf '%s\n' "$*" >&2
  exit 126
}
65    
# dewindoofy
# Filter stdin to stdout, replacing each of the 32 bytes \200-\237 with a
# single printable stand-in; every other byte passes through unchanged.
# (Presumably these are the Windows code-page punctuation bytes, judging by
# the replacement list -- the callers say "assume windows stuff".)
#
# Replacement table, one output byte per input byte:
#   \200-\213   E ? , f " . + # ^ % S <
#   \214-\221   \326 ? Z ? ? '
#   \222-\237   ' " " * - - " T s > \366 ? z Y
dewindoofy () {
  # The table must contain exactly 32 single bytes, so:
  #  * the two dashes are written as \055: a literal "*--" is read by tr as
  #    the range "*" through "-" (four characters), which shifts every
  #    replacement from \226 onwards by one position;
  #  * O-umlaut / o-umlaut are written as the ISO-8859-1 bytes \326 / \366,
  #    so the table does not depend on the encoding this script is saved in
  #    (as UTF-8 literals they would be two bytes each);
  #  * LC_ALL=C keeps tr working on bytes regardless of the caller's locale.
  LC_ALL=C tr '\200-\237' \
    'E?,f".+#^%S<\326?Z??'\'\''""*\055\055"Ts>\366?zY'
}
69    
# Log the invocation (all arguments and the working directory) on stderr.
echo "any2txt $* ($PWD)" >&2

# Consume options until only the last argument -- the file name -- is left.
#   -v        sets verbose=true
#   -m VALUE  stores VALUE in mtype (not read by the code visible here)
# Any other leading argument is skipped silently.
while [ "$#" -gt 1 ]; do # shift options
  case "$1" in
    -v)
      verbose=true
      ;;
    -m)
      # -m needs its value AND a file name after it. Without this guard,
      # "any2txt -m FILE" stored FILE in mtype and left no file name at all,
      # which surfaced later as the confusing "'' not readable".
      [ "$#" -gt 2 ] || err "option -m needs a value followed by a file name"
      mtype="$2"
      shift
      ;;
    *)
      # not an option we know: ignore it
      ;;
  esac
  shift
done
79    
# Nothing is expected on stdout, so whatever gets written there counts as an
# error: send it to stderr. (An earlier variant that skipped this in verbose
# mode is disabled; the redirect is unconditional.)
exec 1>&2
86    
# The one argument left after option parsing is the file to examine.
# It has to be a regular file that we may read; otherwise give up (exit 126).
filename="$1"
# [[ ... && ... ]] instead of the obsolescent and ambiguous "[ ... -a ... ]".
[[ -f "$filename" && -r "$filename" ]] || err "'$filename' not readable"

# Description of the content as reported by file(1); -b leaves out the
# "name:" prefix, "--" keeps a name starting with "-" from being read as an
# option.
ftype="$(file -b -- "$filename")"
91    
# Map the file(1) description in $ftype to a return code (see the table at
# the top of the script) and, where a converter exists, write a plain text
# version to "$filename".txt. For the converter branches the even code is
# set first and bumped to the odd one only if the conversion succeeded.
# Order matters: the first matching pattern wins.
code=124 # unknown, unless a pattern below matches
case "$ftype" in
  *"mail text"*)
    # Plain version = body only: drop everything up to and including the
    # first empty line. Report 17 only when that actually worked.
    code=16
    sed -e '1,/^$/d' <"$filename" >"$filename".txt && code=17
    ;;
  *HTML*) # maybe ASCII or ISO-8859 HTML document text, so check before ASCII
    code=2
    lynx -dump -force_html -nolog "$filename" >"$filename".txt && code=3
    ;;
  # NOTE(review): the three code=1 branches below report 1 even when
  # writing the .txt fails; the code table has no value for that case.
  *"Non-ISO extended-ASCII"*"long lines") # assume windows stuff
    dewindoofy <"$filename" | fmt -s >"$filename".txt
    code=1
    ;;
  *"Non-ISO extended-ASCII"*) # assume windows stuff
    dewindoofy <"$filename" >"$filename".txt
    code=1
    ;;
  ASCII*text*"long lines" | ISO-8859*text*"long lines")
    # only the line length needs fixing
    fmt -s <"$filename" >"$filename".txt
    code=1
    ;;
  ASCII*text* | ISO-8859*text*)
    # already fine text: nothing is written
    code=0
    ;;
  PDF*)
    code=4
    pdftotext -raw "$filename" - >"$filename".txt && code=5
    ;;
  "Rich Text Format"*)
    # RTF -> HTML (unrtf) -> text (lynx reading the pipe via /proc/self/fd/0)
    code=6
    unrtf --nopict --html "$filename" \
      | lynx -dump -force_html -nolist /proc/self/fd/0 >"$filename".txt
    # Both stages must succeed; "&& code=7" alone would look at lynx only
    # and report success for a failed unrtf. Copy PIPESTATUS right away,
    # the next command overwrites it.
    rtf_status=("${PIPESTATUS[@]}")
    if [ "${rtf_status[0]}" -eq 0 ] && [ "${rtf_status[1]}" -eq 0 ]; then
      code=7
    fi
    ;;
  "Microsoft Office Document"*)
    code=8
    antiword "$filename" >"$filename".txt && code=9
    ;;
  # Images are only classified, never converted.
  *"GIF image"*) code=32 ;;
  *"JPEG image"*) code=34 ;;
  *"PC bitmap"*) code=36 ;;
  *"PNG image"*) code=38 ;;
  *"TIFF image"*) code=40 ;;
  *image*) code=46 ;;
esac
136    
# $verbose holds the command name "true" or "false"; when it succeeds, print
# a "<file>: <type>: <code>" summary on stderr.
if $verbose; then
  echo "$filename: $ftype: $code" >&2
fi

# The classification code is the script's exit status.
exit $code

Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26