#!/usr/bin/perl -Tw
#
# LJ Markov Random Text Generator (CGI).
#
# Reads the request parameters `user`, `words` and `order`, fetches the
# user's LiveJournal RSS feed (cached on disk under $DIR for one hour),
# builds an order-N Markov chain from the text inside the feed's
# <description> elements, and prints randomly generated text.
#
# NOTE(review): the HTML markup in this file (form, <p>/<br>/<hr> tags,
# footer link) was reconstructed around the visible page text; the original
# tags were not recoverable from the source as received -- confirm against
# the deployed page before shipping.

use strict;
use CGI;
use LWP::UserAgent;
use XML::DOM;

use vars qw($DIR %total %count);

# Directory holding the request log, the error log and the cached feeds.
$DIR = "/home/ljmarkov";

print "Content-type: text/html; charset=UTF-8\r\n";
print "Cache-control: no-cache\r\n";
print "Expires: 0\r\n";
print "\r\n";

my $q     = CGI->new;
my $user  = $q->param('user')  || '';
my $words = $q->param('words') || 100;
my $order = $q->param('order') || 2;

# Trim surrounding whitespace from every parameter.
for ($user, $words, $order) {
    s/(^\s+|\s+$)//g;
}

# Clamp only values that really are non-negative integers; anything else is
# left untouched so that doit() can report it as invalid input instead of
# it being silently coerced (and triggering numeric warnings under -w).
if ($words =~ /\A\d+\z/ && $words > 1000) {
    $words = 1000;
}
if ($order =~ /\A\d+\z/) {
    if ($order < 1) {
        $order = 1;
    }
    elsif ($order > 5) {
        $order = 5;
    }
}

# Request parameters are untrusted: escape them before echoing into markup.
my ($h_user, $h_words, $h_order) =
    map { $q->escapeHTML($_) } ($user, $words, $order);

print <<EOF;
<html>
<head>
<title>LJ Markov Random Text Generator</title>
</head>
<body>
<h1>LJ Markov Random Text Generator</h1>
<form method="get" action="ljmarkov.pl">
LJ User: <input type="text" name="user" value="$h_user"> (communities and syndicated feeds work too)<br>
Words: <input type="text" name="words" value="$h_words"> (default 100, max 1000)<br>
Order: <input type="text" name="order" value="$h_order"> (default 2, max 5, higher numbers mean less randomness)<br>
<input type="submit"> (it's different every reload)
</form>
<hr>
EOF

if ($user) {
    # The parameters have not been validated yet, so neutralise whitespace
    # and control characters: one request must produce exactly one
    # well-formed, space-separated log record.
    my @fields = map {
        my $v = defined $_ ? $_ : '-';
        $v =~ s/[^\x21-\x7e]/?/g;
        $v;
    } ($ENV{REMOTE_ADDR}, $user, $words, $order);

    my $now = time;
    append_log('log', "$now @fields\n");

    doit($user, $words, $order);
}

print <<EOF;
<hr>
<a href="http://ghewgill.livejournal.com/">[info]ghewgill</a> &lt;greg\@hewgill.com&gt;
</body>
</html>
EOF

exit 0;

# append_log($name, $text)
#
# Appends $text to the file $DIR/$name.  Dies with a diagnostic if the file
# cannot be opened; a failed close is reported as a warning.
sub append_log {
    my ($name, $text) = @_;

    my $path = "$DIR/$name";
    open(my $fh, '>>', $path) or die "cannot append to $path: $!";
    print $fh $text;
    close($fh) or warn "cannot close $path: $!";
    return;
}

# doit($user, $words, $order)
#
# Validates the three inputs, refreshes the cached RSS feed for $user when
# the cache file is missing, empty or more than an hour old, builds the
# Markov chain in %total / %count and prints $words words of generated
# text followed by an explanatory paragraph.  On any problem an "Error: ..."
# line is printed to the page and the sub returns early.
sub doit {
    my ($user, $words, $order) = @_;

    if ($user !~ /\A([a-z0-9_]+)\z/i) {
        print "Error: that doesn't look like an lj user name\n";
        return;
    }
    $user = $1; # untaint

    if ($words !~ /\A\d+\z/) {
        print "Error: expecting numeric input for Words\n";
        return;
    }
    if ($order !~ /\A\d+\z/) {
        print "Error: expecting numeric input for Order\n";
        return;
    }
    if ($order > $words) {
        print "Error: Order must be less than Words\n";
        return;
    }

    my $fn = "$DIR/$user.rss";

    # Names with a leading/trailing underscore use the users.livejournal.com
    # path form; inner underscores become hyphens in the host name.
    my $baseurl = "http://$user.livejournal.com";
    if ($user =~ /^_|_$/) {
        $baseurl = "http://users.livejournal.com/$user";
    }
    elsif ($user =~ /_/) {
        my $urluser = $user;
        $urluser =~ s/_/-/g;
        $baseurl = "http://$urluser.livejournal.com";
    }

    # Re-fetch when there is no usable cache file or it is older than
    # 60 minutes (-M is measured in days).
    if (!-f $fn || -z $fn || -M $fn > 60/1440.0) {
        my $ua = LWP::UserAgent->new;
        $ua->cookie_jar({});

        my $r = $ua->post("http://www.livejournal.com/login.bml",
            {user => 'markov_bot', password => 'xxxxxxxxxxxx',
             expire => 'never', bindip => 'yes'});
        if ($r->code != 200) {
            # A failed login is only logged; the feed fetch is still tried.
            my $now = time;
            append_log('error', "$now Error ".$r->code
                ." logging in to livejournal.com (".$r->message."). \n");
        }

        # Download to a temporary name so that a failed transfer never
        # replaces a previously cached feed.
        $r = $ua->get("$baseurl/data/rss",
            ':content_file' => "$fn.tmp",
            'User-Agent' => "http://apps.hewgill.com/cgi-bin/ljmarkov.pl; greg\@hewgill.com");
        if ($r->code != 200) {
            # NOTE(review): the remote error body is passed through to the
            # page unescaped, as before -- confirm this is intended.
            print $r->content;
            print "Error ".$r->code." <br>\nfrom livejournal.com ("
                .CGI::escapeHTML($r->message).").\n";
            return;
        }
        if (!rename("$fn.tmp", $fn)) {
            my $now = time;
            append_log('error', "$now Error renaming $fn.tmp to $fn: $!\n");
        }
    }

    my $parser = XML::DOM::Parser->new;
    my $doc;
    eval { $doc = $parser->parsefile($fn); };
    if ($@) {
        print "Error: could not parse rss data from livejournal.com (perhaps a misspelled user name?)\n";
        my $now = time;
        append_log('error', "$now Parse error for $user: $@");
        return;
    }

    # Start from an empty chain even if the interpreter is reused.
    %total = ();
    %count = ();

    # @last is the sliding window of the previous $order words; it is
    # carried across text nodes.  %total counts how often each window
    # occurs, %count how often each word follows it.
    my @last;
    traverse($doc->getDocumentElement, sub {
        my $chunk = $_[0];

        # Strip embedded markup; /s lets a tag span line breaks.  Any stray
        # angle bracket left over is escaped so that feed content cannot
        # inject markup into this page.
        $chunk =~ s/<.*?>/ /gs;
        $chunk =~ s/</&lt;/g;
        $chunk =~ s/>/&gt;/g;

        foreach my $word (split ' ', $chunk) {
            if (@last == $order) {
                my $state = join ' ', @last;
                $total{$state}++;
                $count{$state}{$word}++;
                shift @last;
            }
            push @last, $word;
        }
    });

    my @states = keys %total;
    if (!@states) {
        # The feed held fewer than $order + 1 words: nothing to generate.
        print "<br>\n(insufficient source text to continue)\n";
    }
    else {
        # Seed with a random state; all but its last word are printed here,
        # the last one at the top of the first loop iteration.
        my @w = split / /, $states[rand scalar @states];
        print map "$_ ", @w[0..$#w-1];

        for my $i (1 .. $words - $order + 1) {
            print "$w[$order-1] ";

            # Pick a follower of the current state at random, weighted by
            # how often it followed that state in the source text.
            my $state     = join ' ', @w;
            my $followers = $count{$state};
            my $next;
            if ($followers) {
                my $n = rand $total{$state};
                foreach my $candidate (keys %$followers) {
                    if ($n < $followers->{$candidate}) {
                        $next = $candidate;
                        last;
                    }
                    $n -= $followers->{$candidate};
                }
            }

            if (!defined $next) {
                # Dead end: this state was never followed by anything.
                print "<br>\n(insufficient source text to continue)\n";
                last;
            }
            shift @w;
            push @w, $next;
        }
    }

    print "<p>\n";
    print "<p>\nThe above text is generated according to the markov chain model of random text generation, using the public RSS feed of $user's LiveJournal account at $baseurl/data/rss as source input.\n</p>\n";
    return;
}

# traverse($node, $f, $in_description)
#
# Walks the DOM subtree below $node depth-first and calls $f->($text) for
# every text node that lies inside a <description> element.
# $in_description is true once such an ancestor has been entered; callers
# normally omit it.
sub traverse {
    my ($node, $f, $in_description) = @_;

    foreach my $child (@{ $node->getChildNodes }) {
        my $type = $child->getNodeType;
        if ($type == ELEMENT_NODE) {
            traverse($child, $f,
                $in_description || $child->getNodeName eq 'description');
        }
        elsif ($in_description && $type == TEXT_NODE) {
            $f->($child->getNodeValue);
        }
    }
    return;
}