#!/usr/bin/perl
#
# This is a version of ljsm.pl (CVS revision 1.51) by
# Alexander Nikolaev
#
# I have downloaded my entries using ljsm.pl. It took all
# night; I received 17 Mb of files (Nov. 14, 2002). Too much.
#
# Therefore, ljsm.pl was modified so that
# comments are not loaded. Also, the horrid HTML that LJ creates was stripped
# down a bit. The date of the post is prepended to the filename,
# to facilitate sorting of the files.
# Finally, the line was added at the bottom of
# every file. Enjoy.
# NOTE(review): the sentence above seems to have quoted the added line; the
# quoted text is missing from the copy that reached review.
#
# Modifications are marked M.V. All work was done under GPL.
#
# Privet
# Misha Verbitsky
# http://imperium.lenin.ru/~verbit
#
# Author: Alexander Nikolaev
# Descr: fetch all user's posts and/or memories and save them in HTML format

# Strictures first, so that they cover the constant definitions as well.
use strict;

=head1 SYNOPSIS

 perl ljsm-strip.pl [-r -m -a -O -I -x -u user:password -p proxyURL -d yyyy/mm[:yyyy/mm]] user1 user2 ...

 retrieve messages for the given users and save them to the local directory
 -r = resume processing if there is already local file for the given post
 -m = retrieve memories, not posts
 -a = get memories AND posts
 -O = overwrite existing files (NOT recommended)
 -I = ignore network errors and continue fetching posts
 -x = rebuild index file and exit
 -u user:password = specify user/password pair for LJ login on the command prompt
 -p proxyURL = use proxyURL as a http proxy
 -d yyyy/mm[:yyyy/mm] = save posts back to the specified date or in the specified date range

=head1 SETUP

I've tested this script with Windows Me, ActiveState perl v. 5.6.0
with the following ppm modules installed: HTML-Parser 2.33,
HTML-SimpleLinkExtor 0.71, libwww-perl 5.48

If you want to use UTF-8 conversion to local charset you'll also need to
setup Unicode-MapUTF8.

 -------- How to setup Unicode-MapUTF8 for Windows ------------
 1. ppm install Unicode-Map
 2. download, untar and ungzip Unicode-MapUTF8
 3. install Unicode-MapUTF8 ignoring messages about missing modules
 4. go to %PERL%\site\lib\Unicode and edit MapUTF8.pm:
 4.1 comment out 'use Unicode::Map8' and 'use Jcode'
 4.2. insert 'use Unicode::Map' line if it is not already there
 --------------------------------------------------------------

=head1 TODO

 - [x] proxy support
 - [-] usable pager
 - [x] better date range handling (command-line switch?),
 - [x] explicitly show "You must be logged in to view this protected entry"
   and "This journal is deleted." cases in the index file

=head1 SUBROUTINES

=cut

use constant LOGIN         => '';        # leave it empty if you don't want to login
use constant PASSWORD      => '';
use constant START_YEAR    => 2001;      # fetch data back to this year
use constant CLEANUP_HTML  => 1;         # 0 - leave html as it is, 1 - remove sidebars etc
use constant UTF8_DECODE   => 1;         # convert text to local charset
use constant LOCAL_CHARSET => 'KOI8-R';
use constant DEBUG_LEVEL   => 2;         # 0 - quiet, 1 - essential, 2 - verbose
use constant LOCAL_DIR     => '';        # local directory to put files into. Leave it empty to put in the current directory. Slash (/, if not empty) in the end required.
use constant SAVE_PICS     => 1;         # download userpics and standard icons (requires CLEANUP_HTML > 0)
use constant HTTP_PROXY    => '';        # set proxy URL if you use http proxy
use constant CVSVERSION    => '$Revision: 0.9 $';    # don't touch this

# ===================================================================
# end of public constants definition. no user-editable parts below this line
# ===================================================================
use constant BASE_URL     => 'http://www.livejournal.com/';
use constant CATALOG_URL  => BASE_URL . 'view/?type=month';
use constant LOGIN_SCRIPT => 'login.bml';
use constant POST_SCRIPT  => 'talkread.bml';
use constant MEMO_SCRIPT  => 'tools/memories.bml';

# NOTE(review): both literals below look like HTML whose tags were stripped
# before this copy reached review. They are reproduced exactly as received;
# restore the markup from an intact copy of the script.
use constant HTML_FOOTER => "\n\n";
use constant HTML_HEADER => ' Journal entry ';

use LWP::UserAgent;
use HTTP::Cookies;
use HTML::Form;
use HTML::SimpleLinkExtor;
use File::Path;
use File::Basename;
use File::Find;
use Date::Manip qw(ParseDate);    # To convert dates of postings - M. V.

# "use" is executed at compile time, so wrapping it in a run-time "if" does
# not make the module optional. Load it conditionally inside BEGIN instead,
# so that from_utf8 is still imported before the rest of the file compiles.
BEGIN {
    if (UTF8_DECODE) {
        require Unicode::MapUTF8;
        Unicode::MapUTF8->import('from_utf8');
    }
}

use Getopt::Std;
#use Data::Dumper;

# File-scoped state shared by the top-level code and the subroutines.
my ($ua, $req, $res, @posts, %images, $user, %stat, %memories, %posts);

# steal options from @ARGV before we go for users
our ($opt_r, $opt_m, $opt_a, $opt_O, $opt_I, $opt_u, $opt_x, $opt_p, $opt_d);
getopts('rmaxOIu:p:d:');

# sanity checks: at least one user name is required
if (@ARGV == 0) {
    warn "usage: $0 [-r -m -a -O -I -x -u user:password -p proxyURL -d yyyy/mm[:yyyy/mm]] user1 user2 ...\n",
        "-r = resume processing if there is already local file for the given post\n",
        "-m = save memories instead of posts\n",
        "-a = save memories AND posts\n",
        "-O = overwrite existing files (NOT recommended)\n",
        "-I = ignore network errors and continue fetching posts\n",
        "-x = rebuild index file and exit\n",
        "-u user:password = specify user:password pair for LJ login on the command prompt\n",
        "-p proxyURL = use proxyURL as a http proxy\n",
        "-d yyyy/mm[:yyyy/mm] = save posts back to the specified date or in the specified date range\n";
    exit 64;
}

# rebuild indexes and exit if -x option is set
if ($opt_x) {
    foreach $user (@ARGV) {    # for each user
        logmsg("rebuilding index file for user $user...\n");
        build_index($user);
        logmsg("done.\n");
    }
    exit 0;
}

# init global vars
$stat{$_} = 0 foreach ('users', 'pages_ok', 'got_posts', 'images');
%images = ();

$ua = LWP::UserAgent->new;
$ua->cookie_jar(HTTP::Cookies->new(file => "ljcookies.txt", autosave => 1));

# set proxy URL for LWP requests
$ua->proxy('http', HTTP_PROXY) if HTTP_PROXY;
$ua->proxy('http', $opt_p) if $opt_p;

# get cookies: log in only when credentials were configured or passed with -u
if (LOGIN || $opt_u) {
    lj_login() or exit 1;
}

# get posts and memories
foreach $user (@ARGV) {    # for each user
    %memories = %posts = ();
    $stat{'count_posts'} = $stat{'count_memos'} = 0;
    logmsg("\n\n=== processing user $user\n");
    @posts = ();
    push @posts, get_memos($user) if ($opt_m || $opt_a);
    push @posts, get_posts($user) unless ($opt_m && !$opt_a);
    get_files($user);
    undef @posts;    # free memory
    build_index($user);
    $stat{'users'}++;
}

# get images
if (($stat{'got_posts'} > 0) && (scalar keys %images) && CLEANUP_HTML && SAVE_PICS) {
    get_pics();
}

# ============================================
# subroutines
# ============================================

=head2 get_date_range($user)

Returns (start_year, start_month, end_year, end_month) for the catalogue
walk. With -d the range comes from the command line (a missing end defaults
to the current month, and a reversed range is swapped). Otherwise the range
starts at START_YEAR/1, or, when the user's directory already exists and
neither -O nor -r is given, at the newest year/month directory found there.

=cut

sub get_date_range {
    my ($user) = @_;

    my @now = localtime();
    my ($this_year, $this_month) = ($now[5] + 1900, $now[4] + 1);

    if ($opt_d) {
        my ($start_year, $start_month, $end_year, $end_month) = split(/\D/, $opt_d);
        # "-d yyyy" carries no month; start at January instead of month 0
        $start_month = 1           unless $start_month;
        $end_year    = $this_year  unless $end_year;
        $end_month   = $this_month unless $end_month;

        # swap dates if specified in reversed order
        if ($start_year > $end_year) {
            ($start_year, $end_year, $start_month, $end_month) =
                ($end_year, $start_year, $end_month, $start_month);
        }
        elsif (($start_year == $end_year) && ($start_month > $end_month)) {
            ($start_month, $end_month) = ($end_month, $start_month);
        }
        return ($start_year, $start_month, $end_year, $end_month);
    }

    my @full_range = (START_YEAR, 1, $this_year, $this_month);
    my $user_dir   = LOCAL_DIR . $user;

    # set start_year, start_month based on the downloaded posts
    return @full_range if (!(-d $user_dir) || $opt_O || $opt_r);

    my $year = _latest_numeric_entry($user_dir);
    return @full_range unless $year;

    my $month = _latest_numeric_entry("$user_dir/$year") || 1;
    return ($year, $month, $this_year, $this_month);
}

# Returns the numerically largest all-digit entry name in $dir, or undef when
# there is none. Dies when the directory cannot be opened.
sub _latest_numeric_entry {
    my ($dir) = @_;

    opendir(my $dh, $dir) or die "error opening $dir directory: $!\n";
    my ($latest) = sort { $b <=> $a } grep { /^\d+$/ } readdir($dh);
    closedir($dh);    # a handle from opendir must be released with closedir, not close

    return $latest;
}

=head2 get_pics()

Downloads every picture recorded in %images that is hosted under BASE_URL
and does not exist locally yet, and counts the saved files in $stat{images}.

=cut

sub get_pics {
    logmsg("getting pictures...\n", 2);

    foreach my $imgsrc (keys %images) {
        # reduce the recorded src="..." attribute to a path relative to BASE_URL
        $imgsrc =~ s/"//g;
        $imgsrc =~ s/src=(?:\Q@{[BASE_URL]}\E)?\/?//;
        my $target = LOCAL_DIR . $imgsrc;

        next if ($imgsrc =~ m{^http://});    # still absolute: hosted elsewhere
        next if (-f $target);                # test if there is already image with the same name

        # get image
        my $img = get_page(BASE_URL . $imgsrc) or next;
        mkpath(dirname($target), DEBUG_LEVEL, 0755);
        if (open(my $out, '>', $target)) {
            binmode $out;
            print {$out} $img;
            close $out or logmsg("error closing $target: $!\n", 0);
            $stat{'images'}++;
        }
        else {
            logmsg("error opening $target for writing: $!\n", 0);
        }
    }
    return;
}

=head2 get_memos($user)

Walks the user's memories page keyword by keyword and returns a list of hash
references (type 'memo'), one per linked entry that is not saved locally yet
(or every linked entry when -O is given).

=cut

sub get_memos {
    my ($user) = @_;
    my @memos;

    logmsg("getting list of memories...\n", 2);

    # get list of keywords
    my $content = get_page(BASE_URL . MEMO_SCRIPT . "?user=$user");
    unless ($content) {
        # error fetching list of keywords
        logmsg("error fetching list of keywords for $user\n", 0);
        return @memos;
    }

    my $extor_kw = HTML::SimpleLinkExtor->new(BASE_URL);
    $extor_kw->parse($content);

    foreach my $link ($extor_kw->a) {    # get list of keywords links
        next unless $link =~ /@{[MEMO_SCRIPT]}\?user=\w+\&keyword=([\w%\+]+)/;
        my $keyword = $1;
        $keyword = " " unless length $keyword;

        # unescape keywords
        $keyword =~ s/\+/ /g;
        $keyword =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
        $keyword = from_utf8({-string => $keyword, -charset => LOCAL_CHARSET}) if (UTF8_DECODE);

        # get list of posts for the given keyword
        my $listing = get_page($link);
        unless ($listing) {
            # error fetching list of posts
            logmsg("error fetching list of posts for user $user, keyword $keyword", 0);
            next;
        }

        my $extor_posts = HTML::SimpleLinkExtor->new(BASE_URL);
        $extor_posts->parse($listing);

        foreach my $link_post ($extor_posts->a) {
            next unless $link_post =~ /@{[POST_SCRIPT]}\?(journal=(\w+)\&)?itemid=(\d+)/;
            # copy the captures at once so that no later match can clobber them
            my ($amuser, $itemid) = ($2, $3);

            my $saved = LOCAL_DIR . "$user/memories/${amuser}_$itemid.html";
            next unless ($opt_O || !-f $saved);

            push @memos, {
                'type'    => 'memo',
                'status'  => 0,
                'amuser'  => $amuser,
                'keyword' => $keyword,
                'link'    => $link_post->as_string,
                'itemid'  => $itemid,
            };
        }
    }

    return @memos;
}

=head2 get_posts($user)

Walks the month catalogue pages from the end of the date range back to its
start and returns a list of hash references (type 'post'), one per entry
link found.

=cut

sub get_posts {
    my ($user) = @_;
    my @posts = ();

    my ($start_year, $start_month, $end_year, $end_month) = get_date_range($user);
    logmsg("getting posts links for $user "
        . sprintf("[ %4d/%02d - %4d/%02d ]", $start_year, $start_month, $end_year, $end_month)
        . "\n");

    my $year = $end_year;
    YEAR: while ($year >= $start_year) {
        my $emonth = ($year == $start_year) ? $start_month : 1;
        for (my $month = 12; $month >= $emonth; $month--) {
            next if (($year == $end_year) && ($month > $end_month));

            # fetch catalog data
            my $content = get_page(CATALOG_URL . "&user=$user&y=$year&m=$month");
            unless ($content) {
                # error fetching catalog data
                return @posts unless $opt_I;
                next;
            }

            my $extor = HTML::SimpleLinkExtor->new(BASE_URL);
            $extor->parse($content);

            # process links.
            my @links = $extor->a;
            foreach my $link (reverse sort @links) {
                next unless $link =~ /@{[POST_SCRIPT]}(.*?)itemid=([0-9]+)/;
                my ($between, $itemid) = ($1, $2);

                # NOTE(review): this path has no "$user/" component and does
                # not use the "<date>_<itemid>.html" name that get_files
                # writes, so the test probably never succeeds. Left as
                # received; confirm the intended path before changing it.
                my $existing = LOCAL_DIR . "$year/$month/$between.html";
                last YEAR if (-f $existing && !($opt_O || $opt_r));
                # NOTE(review): as received, -r skips every link here.
                next if $opt_r;

                # Let's find date and rnum! - M.V.
                my $linkstring = $link->as_string;
                # Let's remove http://www.livejournal.com
                # from $linkstring. - M. V.
                $linkstring =~ s/http:\/\/www.livejournal.com//;
                # Also, replace & with &
                # NOTE(review): the replacement below reads "&;" as received;
                # an HTML entity was probably decoded in extraction. Restore
                # it from an intact copy.
                $linkstring =~ s/\&/\&\;/;
                my $rnum = 0;

                # >>> FIXME(review): extraction damage. Everything between
                # the start of the pattern below and the first key of the
                # pushed hash is missing from the copy that reached review.
                # The text is kept exactly as received and will not compile
                # until it is restored from an intact copy of the script.
                if ($content =~ / 'post',
                    'status'      => 0,
                    'year'        => $year,
                    'month'       => $month,
                    'link'        => $link->as_string,
                    'itemid'      => $itemid,
                    'date'        => $date,
                    'replies_num' => $rnum
                    # Date and replies_num are found by parsing
                    # the catalogue entry - M.V.
                };
                # <<< end of damaged region
            }    # link loop on the catalog page
        }    # months loop
        $year--;
    }    # years loop

    return @posts;
}

=head2 get_files($user)

Downloads every entry queued in @posts and writes it below
LOCAL_DIR/$user; existing non-empty files are skipped unless -O is given.

=cut

sub get_files {
    my ($user) = @_;
    my ($post, $content, $dir, $fname, $result, $extor, $up, $navbar, $n);

    logmsg("getting posts...\n");
    foreach $post (@posts) {
        if ($post->{'type'} eq 'post') {
            $dir   = LOCAL_DIR . "$user/$post->{year}/$post->{month}";
            $fname = "$post->{date}_$post->{itemid}.html";
            $up    = "../../..";
        }
        else {    # memo
            $dir   = LOCAL_DIR . "$user/memories";
            $fname = "$post->{amuser}_$post->{itemid}.html";
            $up    = "../..";
        }

        if (-s "$dir/$fname") {
            next unless $opt_O;
            logmsg("!! overwriting $dir/$fname\n", 2);
        }

        if ($content = get_page($post->{'link'})) {
            $stat{'got_posts'}++;
            mkpath($dir, DEBUG_LEVEL, 0755);

            # >>> FIXME(review): extraction damage. From here to the matching
            # end marker the HTML inside patterns and strings (everything
            # between a '<' and the next '>') is missing from the copy that
            # reached review, so the brace structure cannot be verified
            # either. The text is kept exactly as received, only re-wrapped
            # at statement boundaries, and will not compile until it is
            # restored from an intact copy of the script.
            if (CLEANUP_HTML) {
                # print "Cleaning $post->{'link'}...\n";
                my $result = '';
                # $navbar =1;
                foreach (split(/\n/, $content)) {
                    if (// .. (// || /<\/body>/ || /

.Post Comment<\/font>

/);
                    next if (/^
$/);
                    # skip the blue navigation table
                    # (Didn't work because cellpadding/cellspacing
                    # were other way around... this sucks - M.V.)
                    # if ($navbar && ($n = /^$/ .. /^<\/table>$/)) {}
                    # Let's just kill a7c7e8 - M.V.
                    next if (/a7c7e8/ .. /^<\/table>$/);
                    # Let's kill the form with "Post Comment" - M.V.
                    s/

.\n

/s;
                    # decode UTF8
                    $_ = from_utf8({-string => $_, -charset => LOCAL_CHARSET}) if (UTF8_DECODE);
                    $result .= "$_\n";
                    map { $images{$_} = 1} grep(m{/(img|userpic)/}, split(/\s+/, $_));
                    last if (//);
                }
                }
                if (SAVE_PICS) {
                    logmsg("processing pictures and relative links...\n", 4);
                    $result =~ s/src="\/(userpic|img)\//src="$up\/$1\//sg;
                    #why icons have absolute src?
                    $result =~ s/src="@{[BASE_URL]}(img\/(.*?)\.gif)"/src="$up\/$1"/sg;
                    # replace all relative links with absolute ones
                    $result =~ s/{'keyword'};
                $content .= "\n\n" if (defined ($post->{'keyword'}));
                # $content, -charset => LOCAL_CHARSET});
            }
            # <<< end of damaged region

            logmsg(">> $dir/$fname\n", 2);
            open(my $out, '>', "$dir/$fname")
                or die "error opening $dir/$fname for writing: $!\n";
            print {$out} $content;
            close $out or warn "Error closing $dir/$fname : $!\n";
            $post->{'status'} = 1;
        }
        else {
            # error fetching page
            last unless $opt_I;
        }
    }
}

=head2 lj_login()

Fetches the login page, fills in the second form found on it and submits it
so that the user agent's cookie jar receives the session cookies. Returns 1
on success and nothing on failure.

=cut

sub lj_login {
    logmsg("logging in to " . BASE_URL . "... \n", 1);

    my ($user, $password) = ((defined $opt_u) && (length($opt_u) > 0))
        ? split(":", $opt_u, 2)
        : (LOGIN, PASSWORD);

    # POST login form to get cookies
    my $content = get_page(BASE_URL . LOGIN_SCRIPT)
        or return;    # couldn't get login form page

    my ($searchform, $loginform) = HTML::Form->parse($content, BASE_URL);

    unless (defined $loginform && $loginform->find_input('user')) {
        # no login form
        logmsg("got login page " . BASE_URL . LOGIN_SCRIPT . " but found no login form on it.\n", 0);
        return;
    }

    $loginform->value('user',     $user);
    $loginform->value('password', $password);

    # submit login form
    $res = $ua->request($loginform->click);
    unless ($res->is_success) {
        # A method call is not interpolated inside a string, so the form
        # action has to be concatenated in.
        logmsg("error submitting login form to " . $loginform->action . "\n", 100);
        logmsg($res->error_as_HTML . "\n", 0);
        return;
    }

    # invalid login/password
    # NOTE(review): the pattern may have lost leading markup in extraction;
    # it is used exactly as received.
    if ($res->content =~ /Error<\/span>/) {
        logmsg("invalid login.\n", 0);
        return;
    }

    logmsg("got cookies.\n", 1);
    return 1;
}

=head2 get_page($url)

Downloads $url and returns its content, or (content, content_type) in list
context. Returns nothing on failure, after logging the error. Updates the
pages_ok / pages_err counters in %stat.

=cut

sub get_page {
    my ($url) = @_;

    # Replacing talkread with talkpost to save bandwidth. - M.V.
    $url =~ s/talkread/talkpost/;
    logmsg("<< $url\n", 2);

    # send request
    $req = HTTP::Request->new(GET => $url);
    $res = $ua->request($req);

    # process response
    if ($res->is_success) {
        $stat{'pages_ok'}++;
        return wantarray ? ($res->content, $res->content_type) : $res->content;
    }

    $stat{'pages_err'}++;
    logmsg("$url\:\n" . $res->error_as_HTML() . "\n", 0);
    return;
}

# Prints $message to STDOUT when no level is given; otherwise sends it to
# STDERR, but only if the level does not exceed DEBUG_LEVEL.
sub logmsg {
    my ($message, $loglvl) = @_;

    if (!defined $loglvl) {
        print $message;
    }
    elsif ($loglvl <= DEBUG_LEVEL) {
        warn $message;
    }
}

=head2 build_index($user)

Scans LOCAL_DIR/$user with File::Find, collecting one index line per saved
file through process_html_file, and writes index-ljsm.html there.

=cut

sub build_index {
    my ($user) = @_;
    my @months = ('', 'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December');

    my $user_dir = LOCAL_DIR . $user;

    # skip to next dir if there is no such user
    unless (-d $user_dir) {
        logmsg($user_dir . " not found.");
        return;
    }

    # traverse directory tree calling process_html_file for each file found
    find({
        wanted     => \&process_html_file,
        preprocess => \&sort_directory
    }, $user_dir);

    # write index.html
    my $index_file = $user_dir . "/index-ljsm.html";
    open(my $index_fh, '>', $index_file)
        or die "error opening $index_file for writing: $!\n";

    # NOTE(review): every literal printed below lost its HTML tags before
    # this copy reached review. The surviving text is reproduced as received;
    # restore the markup from an intact copy of the script.
    # "${user}'s" is written with braces because "$user's" would be parsed
    # as the package variable $user::s.
    print {$index_fh} <<EOH;
 Index file for $user livejournal


${user}'s livejournal.   
EOH

    print {$index_fh} "$stat{count_memos} memories " if (scalar keys %memories);
    if (scalar keys %posts) {
        print {$index_fh} " | $stat{count_posts} posts: ";
        foreach (sort keys %posts) {    # foreach year
            print {$index_fh} "$_ ";
        }
        print {$index_fh} "\n";
    }
    print {$index_fh} "\nlast updated: " . (scalar localtime) . "\n";
    print {$index_fh} "\n" . "\n";

    if (scalar keys %posts) {
        foreach my $year (reverse sort keys %posts) {
            # $posts{$year} is a reference to the hash of months
            # year header
            print {$index_fh} "\n";
            print {$index_fh} "\n\n\n" . "\n";
            print {$index_fh} '' . $year . ': ';
            print {$index_fh} "" . $months[$_ + 0] . " | "
                foreach (sort { $a <=> $b } keys %{$posts{$year}});
            print {$index_fh} "\n\n" . "\n";

            # year body
            for my $month (reverse sort { $a <=> $b } keys %{$posts{$year}}) {
                print {$index_fh} "[ $months[$month] ]\n\n";
                print {$index_fh} $posts{$year}->{$month} . "\n\n\n";
            }
        }
    }

    if (scalar keys %memories) {
        print {$index_fh} '' . "\n";
        print {$index_fh} "\n\n\n" . "\n";
        print {$index_fh} 'Memories: ' . "\n";
        print {$index_fh} "\n" . "\n";
        print {$index_fh} "\n\n\n$_\n\n" . $memories{$_} . "\n\n"
            foreach (sort keys %memories);
    }

    print {$index_fh} <<EOE;


 generated by ljsm-strip.pl @{[CVSVERSION]}
EOE

    close $index_fh or warn "Error closing file: $!\n";
}

# sort filenames so that the most recent posts go first
sub sort_directory {
    return sort { $b <=> $a } @_;
}

# callback subroutine for build_index
#
# Called by File::Find with $_ set to the file name and the current directory
# set to the file's directory. Reads the saved entry and appends one index
# line for it to %memories or %posts.
sub process_html_file {
    my ($line, $link, $kw, $title, $amuser, $itemid, $date);

    return unless ($File::Find::dir =~ m#(\d{4}/\d{1,2}|memories)#);
    return unless ((-s $_) && /\.html$/);

    # $_ is set to file name and we are inside target directory
    open(my $in, '<', $_)
        or die "Error opening $File::Find::name for reading: $!\n";

    # search for link, keywords, title and date
    # NOTE(review): the patterns in this loop lost their HTML tags before
    # this copy reached review. They are reproduced as received. The keyword
    # pattern is now empty, and an empty pattern re-uses the last successful
    # one, so $kw cannot be trusted until the pattern is restored.
    while ($line = <$in>) {
        $kw    = $1   if ($line =~ //);
        $title = $1   if ($line =~ /(.*?)<\/b><\/i>/);
        $title = "$1" if ($line =~ /Error<\/span>
(.*)$/);
        $link  = $1   if ($line =~ /talkpost.bml\?([\w=&%;]*?)">Post a new comment<\/a>\)<\/b>/);
        $date  = $1   if ($line =~ m{href="@{[BASE_URL]}users/\w+/day/\d\d\d\d/\d\d/(\d{1,2})"});
        last if ($link);
    }
    $date = sprintf("%02d. ", $date) if $date;
    # NOTE(review): as received this replaces "&" with "&"; the pattern side
    # was probably an HTML entity that got decoded in extraction.
    $link =~ s/&/&/g if ($link);
    close $in or warn "Error closing $File::Find::name : $!\n";

    $title = 'no title' unless ($title);

    if ($File::Find::dir =~ /memories/) {
        $stat{'count_memos'}++;
        # if we didn't get $link (empty html?) try to figure it from file's name
        if (!$link) {
            ($amuser, $itemid) = split(/_/, $_, 2);
            $link = ($amuser) ? "journal=$amuser&itemid=$itemid" : "itemid=$itemid";
        }
        $kw = 'default' unless ($kw);
        # NOTE(review): the link markup around the title is missing here.
        $memories{$kw} .= "\n$title   | »\n\n";
    }
    elsif ($File::Find::dir =~ m{(\d{4})/(\d{1,2})}) {
        my ($year, $month) = ($1, $2);
        $stat{'count_posts'}++;
        $posts{$year} = { $month => '' } if (!defined $posts{$year});
        # NOTE(review): the link markup around the title is missing here.
        $posts{$year}->{$month} .= "$date $title   | »\n\n";
    }
    else {
        # html file in unknown directory. just do nothing
    }
}

# print some statistics and kiss goodbye
END {
    # the per-user counters are only meaningful inside build_index
    delete $stat{'count_posts'};
    delete $stat{'count_memos'};

    if ((DEBUG_LEVEL > 0) && (scalar keys %stat)) {
        print "\n\n================ s t a t i s t i c s ====================\n";
        print "ljsm-strip.pl @{[CVSVERSION]}\n";
        # sorted so that the report is stable from run to run
        print "$stat{$_} $_ " foreach sort keys %stat;
        print "\n=========================================================\n";
    }
}