#!/usr/bin/perl -w
# $Id: /xmltwig/trunk/tools/ooo2txt/ooo2txt-006 4 2007-03-16T12:16:25.259192Z mrodrigu  $
use strict;
use XML::Twig;

# -----------------------------------------------------------------
# Author         : F. Labbe
# private        : fred@frederic-labbe.com
# professional   : frederic.labbe@ch-avranches-granville.fr
#                : http://ooo2txt.fr.st/
#                  Modified by mirod <mirod@xmltwig.com>
# object         : convert OpenOffice.org file into ascii
# Test           : 
# usage (source) : perl ooo2txt [-on] [-e encoding] [-f field_name] ooo_file 
# usage (binary) :      ooo2txt [-j] [-f field_name] ooo_file 
#
# version        : 25/09/2002

my $ooo2txt_version = "0.0.6";

use strict;

use Getopt::Std;
use XML::Twig;
use Archive::Zip qw(:ERROR_CODES);
use File::Temp qw/ tempfile/;

# Usage text shown for -h and on command-line errors.
my $USAGE= "USAGE: $0 [[-f <field_name>|all] | [-o] | [-n]] [-e <encoding>] <ooo_file> 
see http://ooo2txt.fr.st/";

# Name of the archive member that is extracted and parsed.
my $OOO_XML_CONTENT= 'content.xml';

my %opt;

# Command-line options:
#   -o              output only the outline (numbered titles)
#   -n              number the titles
#   -v              print the version and exit
#   -h              print the usage and exit
#   -e <encoding>   output encoding
#   -f <field>|all  output only the given field, or all fields
#   -t <width>      column width of the label part of a field line (default 30)
#   -d <string>     fill string repeated between label and value (default '.')
# -d has to be declared here, otherwise the default set below could never
# be overridden; a failed parse stops the run instead of being ignored.
getopts('onvhe:f:t:d:', \%opt) or die $USAGE, "\n";

die "$0 version $ooo2txt_version\n" if( $opt{v});
die $USAGE, "\n" if( $opt{h});
die "options -o and -f are exclusive\n", $USAGE, "\n"
    if( defined( $opt{o}) && defined( $opt{f}));

# test definedness/length rather than truth so that a file named "0" is accepted
my $zip_name = shift(@ARGV);
die $USAGE, "\n" unless( defined( $zip_name) && length( $zip_name));

my $zip  = Archive::Zip->new( $zip_name)       or  die "cannot read archive file $zip_name\n";
my $file = $zip->memberNamed($OOO_XML_CONTENT) or  die "Can't access data file $OOO_XML_CONTENT in zip.\n";

# the content is extracted to an anonymous temporary file, then parsed from it
my $xml  = tempfile();
my $status= $file->extractToFileHandle($xml);
die "Extracting $OOO_XML_CONTENT from $zip_name failed\n" if( $status != AZ_OK);
seek( $xml, 0, 0) or die "cannot rewind extracted $OOO_XML_CONTENT: $!\n";

my %option;

my $state={}; # various state information used during parsing

$option{output_encoding}= $opt{e} if( $opt{e});
# $conv converts the text to the output encoding (identity when -e is not used)
my $conv= $opt{e} ? XML::Twig::encoding_filter( $opt{e}) : sub { return join '', @_; };

$opt{t}||= 30;
$opt{d}||= '.';

# select the handlers according to the requested output mode
my %handlers;

if( defined( $opt{f}) && length( $opt{f}))
  { # output only field(s)
    %handlers= ( 'text:p' => sub { display_fields( $opt{f}, @_); }, );
  }
elsif( $opt{o})
  { # output only outline
    %handlers= ( 'text:h' => \&h, );
  }
else
  { # output all text
    %handlers= ( 'text:h' => \&h,
                 'text:p' => sub { my( $twig, $p)= @_;
                                   print $conv->( $p->text), "\n";
                                 },
               );
  }

my $t= XML::Twig->new( %option, twig_roots => \%handlers);

$t->parse( $xml);

# Twig handler for titles (text:h elements).
#   $t : the twig (unused)
#   $h : the title element
# Prints the title text, converted with $conv, followed by a newline.
# When -n or -o is set and the element has a true text:level attribute,
# an empty line is printed first and the title is prefixed with the number
# returned by current_number.
sub h
  { my( $t, $h)= @_;
    my $title= $h->text;

    # the level is only looked up when titles have to be numbered
    my $level= ( $opt{n} || $opt{o}) ? $h->att( "text:level") : undef;

    if( $level)
      { print "\n";
        $title= current_number( $level, $state) . $title;
      }

    print $conv->( $title), "\n";
  }

# Return the number of the next title at a given level, e.g. "2.1 ".
#   $text_level : level of the title, 1 for the top level
#   $state      : hash ref; the counters are kept as an array ref in
#                 $state->{text_numbering}, one counter per level
# Returns the counters up to $text_level joined by '.', followed by a space.
sub current_number
  { my( $text_level, $state)= @_;
    $state->{text_numbering}||= [];
    my $nb= $state->{text_numbering};

    # forget the counters of deeper levels only: the counter of the current
    # level must be kept, or titles of the same level would all get number 1
    splice( @$nb, $text_level) if( @$nb > $text_level);

    $nb->[$text_level-1]++;

    # a skipped level (a level 3 title directly under a level 1 title) leaves
    # undefined counters: number them 0 instead of joining undef values
    foreach my $counter (@$nb)
      { $counter= 0 unless( defined $counter); }

    return join( '.', @$nb) . " ";
  }
    
# Twig handler for paragraphs (text:p elements) when -f is used.
#   $fields : 'all', or the text:description value of the field to output
#   $t      : the twig (unused)
#   $p      : the paragraph element
# For each matching text:text-input child of the paragraph, the text of the
# children located before it is used as the label of the field; those
# children and the field itself are cut from the paragraph, and the
# label/value pair is passed to display_field_line.
sub display_fields
  { my( $fields, $t, $p)= @_;

    my $filter= qq{text:text-input};
    $filter .= qq{[\@text:description="$fields"]} unless( $fields eq 'all');

    # each pass consumes the first remaining field of the paragraph
    while( my $field= $p->first_child( $filter))
      { my $label= '';

        foreach my $sibling ( $p->children)
          { last unless( $sibling->before( $field));
            $label .= $sibling->text;
            $sibling->cut;
          }

        $field->cut;
        display_field_line( $label, $field->text);
      }
  }

# Print one "label ..... value" line.
#   $text  : label of the field
#   $field : value of the field
# The label is followed by a space, $opt{d} repeated so that label plus fill
# reach $opt{t} characters (no fill when the label is longer), another space,
# the value and a newline. Both strings are converted with $conv.
sub display_field_line
  { my( $text, $field)= @_;

    # measure the label before it is converted: after conversion length()
    # would count encoded bytes, which misaligns multi-byte output encodings
    my $nb_dots= $opt{t} - length( $text);
    $nb_dots= 0 if( $nb_dots < 0);

    print $conv->( $text), " ", $opt{d} x $nb_dots, " ", $conv->( $field), "\n";
  }

__END__

=head1 NAME

  ooo2txt

=head1 DESCRIPTION

read a Star/Open Office file (only Writer is supported at the moment, C<.sxw>/C<.stw> files)
and display the text.
  

=head1 SYNOPSIS

  ooo2txt doc.sxw                 # output text (in utf-8) for the document
  ooo2txt -e ISO-8859-15 doc.sxw  # output ISO-8859-15 encoded text for the doc
  ooo2txt -n doc.sxw              # output text, titles are numbered
  ooo2txt -f NUM_CUST doc.sxw     # output field NUM_CUST for the doc
  ooo2txt -f all doc.sxw          # output all fields for the doc
  ooo2txt -o doc.sxw              # output outline (titles are numbered)

=head1 OPTIONS

=over 4

=item *

C<< -e [encoding] >> output encoding for the text (as per C<iconv>)

=item *

C<-n> number titles

=item *

C<-o> output only an outline of the doc (titles are numbered)

=item *

C<-f [all|field_name]> output all or a single field from the document

=back

=head1 TODO

  format tables properly
  get the style information to properly number titles

=head1 BUGS

  tables are not displayed properly
  numbering is very crude

=head1 PREREQUISITE

Archive::Zip

XML::Twig

XML::Parser

To use the C<-e> option, a way to convert encodings is needed (Text::Iconv and Iconv, Encode, or Unicode::String and Unicode::Map8)


=head1 AUTHOR

Michel Rodriguez <mirod@xmltwig.com>
based on work by F. Labbe <fred@frederic-labbe.com> 
                          <frederic.labbe@ch-avranches-granville.fr>

=head1 LICENSE

This library is free software; you can redistribute it and/or modify it under 
the same terms as Perl itself.

Comments can be sent to mirod@xmltwig.com

=head1 SEE ALSO

OpenOffice.org: http://www.openoffice.org/
XML::Twig: http://www.xmltwig.com
Ooo2txt: http://ooo2txt.fr.st/
