#!/usr/bin/perl
require 5;
use strict; # Time-stamp: "2000-11-24 23:36:13 MST"
# no utf; # Uncomment under Perl 5.6 or later
=head1 NAME
rtf2xml -- dump an RTF document's structure as XML
=head1 SYNOPSIS
% cat foo.rtf
{\stylesheet{\fs20 \snext0 Normal;}}
% rtf2xml < foo.rtf
<_><_>Normal;
=head1 DESCRIPTION
This program is a filter that takes an RTF stream on STDIN and emits
its document tree structure as XML on STDOUT. It assumes that the
input is well-formed RTF.
=head1 COPYRIGHT
Copyright (c) 2000 Sean M. Burke. All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=head1 AUTHOR
Sean M. Burke C
=cut
binmode(STDIN);
die "$0 takes no command-line parameters. Provide input on STDIN.\n" if @ARGV;
# For a modular approach to RTF parsing, see RTF::Parser in CPAN,
# http://search.cpan.org/search?dist=RTF-Parser
my $open_count = 0;
while() {
while( # Iterate over tokens on each line
m<
\G(?:
([{}]) # \1
|
(?: \\
([a-z]+) # \2: keyword
(-?\d+)? # \3: number
\x20?
)
|
(?: \\
(?: (?: '([0-9a-fA-F]{2})) # \4: hex escape
| ([-_|~:*\cm\cj{}])) # \5: magic character
)
|
([\cm\cj]+) # \6
|
([<>&]) # \7
|
([^\\{}\cm\cj]+) # \8: unescaped character data (not [<>&])
)
>sgx) {
if( defined $1 ) {
if($1 eq '{') {
++$open_count; print '<_>';
} else {
exit if --$open_count < 0; print '';
}
} elsif( defined $2 ) {
#print "\n";
if(defined($3)) {
if($2 ne 'bin') {
print "<$2 _='$3'/>"; # always -+\d+, so needs no escaping
} else { # special case: the 'bin' word.
if($3 < 1) { # sanity
print "";
} elsif($3 < (length $_ - pos($_))) { # all here
my $b = substr($_,pos($_), $3);
pos($_) += $3; # skip over that stuff;
$b =~ s/([^\x21-\x25\x27-\x3B\x3D\x3F-\x7E])/''.(ord($1)).';'/eg;
print "";
} else {
my $b = substr($_,pos($_), $3);
my $l = $3 - length $b;
$_ = ''; # clear buffer
$b =~ s/([^\x21-\x25\x27-\x3B\x3D\x3F-\x7E])/''.(ord($1)).';'/eg;
print "";
}
}
} else {
print "<$2/>"; # always a-z, so needs no escaping
}
} elsif( defined $4 ) {
print "$4;";
} elsif( defined $5 ) {
# ([-_|~:*\cm\cj{}]) ) # \5: magic character of some sort
if($5 eq '*') { print '' } # S for Star.
elsif($5 eq "\cm"
or $5 eq "\cj") { print '' }
elsif($5 eq '-') { print '' }
elsif($5 eq '|') { print '' }
elsif($5 eq '_') { print '‑' } # nonbreaking hyphen
elsif($5 eq ':') { print '' }
elsif($5 eq '~') { print ' ' }
else { print $5 } # { or }
} elsif( defined $6 ) {
# no-op
} elsif(defined $7) {
if($7 eq '&') { print '&' }
elsif($7 eq '<') { print '<' }
else { print '>' }
} else {
print $8;
}
}
}