#!/usr/bin/perl

# Copyright (C) 2007 王亮 <netcasper AT gmail DOT com>

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public Licence as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.

# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public Licence for more details.

use strict;
use warnings;

use YAML::Syck;
use HTML::Template;
use DateTime;
use DateTime::Format::W3CDTF;
use XML::Feed;
use LWP::UserAgent;
use URI;
use File::Path;
use HTTP::Status;
use Encode;
use Getopt::Long;
use Digest::SHA;
use File::HomeDir;
use File::Spec::Functions;
use IO::File;
use Scalar::Util qw(blessed);

my $output = "subscription.html";
my $parse_only;
my $remove_old_entries;
my $max_feed_num;
my $feed_file;
my $feed_url;

my $result = GetOptions (
    "output=s" => \$output,
    "parse-only" => \$parse_only,
    "remove-old-entries" => \$remove_old_entries,
    "max-feed-num|f=i" => \$max_feed_num,
    "feed-file=s" => \$feed_file,
    "feed-url=s"  => \$feed_url,
);

die "Usage: perl print.pl [--output filename] [--parse-only] [--remove-old-entries] [[--max-feed-num=NUM]|[-f NUM]]\n" unless $result;

my $feedman_home = catdir(File::HomeDir->my_home, ".feedman");

my $templatefile = catfile($feedman_home, "feedman.tmpl");
my $cachefile    = catfile($feedman_home, "feed.yaml");
my $datefile     = catfile($feedman_home, "last_update_date.yaml");
my $entryfile    = catfile($feedman_home, "last_update_entries.yaml");
my $getnewfile   = catfile($feedman_home, "getnew.yaml");

my %subscriptions;
my %last_update_date;
my %last_update_entries;
my %always_get_new;

my $now = DateTime->now(time_zone => 'Asia/Shanghai');
my $total_new_entries = 0;

check_and_load();

main() unless caller();

sub main {
    if (defined $feed_file || defined $feed_url) {
        my $filename;
        my $last_update;
        if (defined $feed_file) {
            $filename = $feed_file;
        }
        else {
            my $s = get_subscription($feed_url);
            $filename = catfile(make_directory($s->{xmlUrl}), "feed");
            $last_update = get_last_update_date($s->{htmlUrl});
        }
        $filename = get_correct_feed_filename($filename);
        my $r = get_latest_update_date($filename, $last_update);
        if (defined $r) {
            my $date = $r->{date};
            my $str = DateTime::Format::W3CDTF->format_datetime($date);
            print "latest update: $str\n";
        }
        else {
            die "Date not found.\n";
        }
    }
    else {
        my @htmlUrl = keys %subscriptions;
        my $items = list_items(\@htmlUrl);
        my $sorted_items = sort_items_by_date($items);
        generate_html_from_template(scalar @htmlUrl, $sorted_items);
        if (!(defined $parse_only) && !(defined $max_feed_num)) {
            &save_updated_info;
        }
    }
}

sub check_and_load {
    local $YAML::Syck::ImplicitUnicode = 1;

    die "$templatefile not found.\n"
        unless -e $templatefile and -f _;

    if (-e $cachefile and -f _) {
        my $data = LoadFile($cachefile);
        if (defined $max_feed_num) {
            if ($max_feed_num > 0) {
                for my $k (keys %$data) {
                    $subscriptions{$k} = $data->{$k};
                    last if --$max_feed_num == 0;
                }
            }
            else {
                warn "invalid value($max_feed_num) from option --max-feed-num, load all feeds.\n";
            }
        }
        else {
            %subscriptions = %$data;
        }
    }
    else {
        die "$cachefile not found.\n";
    }

    if (-e $datefile and -f _) {
        my $data = LoadFile($datefile);
        %last_update_date = %$data;
        for my $key (keys %$data) {
            parse_last_update_date($key);
        }
    }
    else {
        warn "$datefile not found.\n";
    }

    if (-e $entryfile and -f _) {
        my $data = LoadFile($entryfile);
        %last_update_entries = %$data;
    }
    else {
        warn "$entryfile not found.\n";
    }

    if (-e $getnewfile and -f _) {
        my $date = LoadFile($getnewfile);
        %always_get_new = %$date;
    }
    else {
        warn "$getnewfile not found.\n";
    }
}

sub get_last_update_date {
    my $url = shift;
    return unless exists $last_update_date{$url};

    return $last_update_date{$url};
}

sub set_last_update_date {
    my $url = shift;
    $last_update_date{$url} = shift;
}

sub remove_last_update_date {
    my $url = shift;
    delete $last_update_date{$url};
}

sub format_last_update_date {
    my $url = shift;
    my $d = get_last_update_date($url);
    set_last_update_date($url, DateTime::Format::W3CDTF->format_datetime($d));
}

sub parse_last_update_date {
    my $url = shift;
    my $d = get_last_update_date($url);
    set_last_update_date($url, DateTime::Format::W3CDTF->parse_datetime($d));
}

sub get_correct_feed_filename {
    my $filename = shift;
    if ($filename =~ m/blog\.sina\.com\.cn/) {
        my $newfile = "$filename.new";
        warn "$newfile not found.\n" unless -e $newfile and -f _;
        return $newfile;
    }
    else {
        return $filename;
    }
}

# mirror still works since original file doesn't change.
sub correct_feed {
    my $filename = shift;

    my $newfile = get_correct_feed_filename($filename);
    if ($filename ne $newfile) {
        my $in = IO::File->new($filename, "r")
            or die "can not open $filename.\n";
        my $out = IO::File->new("$newfile", "w")
            or die "can not open $newfile.\n";
        while (my $line = <$in>) {
            chomp $line;
            $line =~ s{(<pubDate>.*?)GMT\+8(</pubDate>)}{$1+0800$2};
            print $out "$line\n";
        }
        close $in;
        close $out;
        return $newfile;
    }

    return $filename;
}

sub get_subscription {
    my $url = shift;
    return $subscriptions{$url};
}

sub get_new_always {
    my $url = shift;
    return exists $always_get_new{$url};
}

sub get_digest_string {
    my ($url, $feed) = @_;
    if (exists $always_get_new{$url}->{$feed}) {
        # convert number to string by quoting.
        return "$always_get_new{$url}->{$feed}";
    }
    else {
        return "N/A";
    }
}

sub set_digest {
    my ($url, $feed, $digest) = @_;
    $always_get_new{$url}->{$feed} = $digest;
}

sub list_items {
    my $htmlUrl = shift;
    my $ua = LWP::UserAgent->new;
    $ua->timeout(30);

    my @items;
    for my $url (@$htmlUrl) {
        my $s = get_subscription($url);
        my $filename = catfile(make_directory($s->{xmlUrl}), "feed");

        # list of available feeds
        my @feeds;
        if (exists $s->{feeds}) {
            @feeds = (ref $s->{feeds}) ?
                @{$s->{feeds}} : ($s->{feeds});
        }
        unshift @feeds, $s->{xmlUrl};

        # get update date and new entries of feed
        my $update;
        my @new_feed_entries;
        if (defined $parse_only) {
            $filename = get_correct_feed_filename($filename);
            # update date is never undefined when parsing only.
            $update = get_last_update_date($s->{htmlUrl});
            if (defined $update) {
                @new_feed_entries = get_previous_entries($filename);
            }
            else {
                my $r = get_latest_update_date($filename);
                if (defined $r) {
                    $update = $r->{date};
                    @new_feed_entries = @{$r->{entries}};
                }
            }
            $update = $now unless defined $update;
        }
        else {
            my $changed = 0;
            if (get_new_always($url)) {
                $changed = get_new($ua, $filename, $url, @feeds);
            }
            else {
                $changed = update_feed($ua, $filename, @feeds);
            }


            # reduce times of parsing feed
            my $prev_update = get_last_update_date($s->{htmlUrl});
            if ($changed != 0 || !defined $prev_update) {
                $filename = correct_feed($filename);
                my $r = get_latest_update_date($filename, $prev_update);
                if (defined $r) {
                    $update = $r->{date};
                    @new_feed_entries = @{$r->{entries}};
                }
		elsif (defined $remove_old_entries) {
		    pop_previous_entries($filename);
		}
            }
            else {
                $update = $prev_update;
                $filename = get_correct_feed_filename($filename);
		if (defined $remove_old_entries) {
		    @new_feed_entries = pop_previous_entries($filename);
		}
		else {
		    @new_feed_entries = get_previous_entries($filename);
		}
            }
        }

        my $num_new_feed_entries = scalar @new_feed_entries;
        $total_new_entries += $num_new_feed_entries;

        my $feed_info = fill_template ($s, $update, $num_new_feed_entries,
                                       \@new_feed_entries);
        push @items, $feed_info;
    }
    return \@items;
}

sub sort_items_by_date {
    my $items = shift;
    my @na = grep { ! UNIVERSAL::isa($_->{FEED_UPDATE_DATE}, 'DateTime') } @$items;
    my @others = grep { UNIVERSAL::isa($_->{FEED_UPDATE_DATE}, 'DateTime') } @$items;

    my @sorted_items = sort { DateTime->compare($b->{FEED_UPDATE_DATE}, $a->{FEED_UPDATE_DATE}) } @others;

    push @sorted_items, @na;
    return \@sorted_items;
}

sub generate_html_from_template {
    my ($number, $items) = @_;
    my $t = HTML::Template->new(filename => $templatefile);
    $t->param(FEEDNUM => $number);
    $t->param(NUM_NEW_ENTRIES => $total_new_entries);
    $t->param(DATE => $now);
    $t->param(SUBSCRIPTION => $items);

    open HTML, ">$output";
    print HTML encode_utf8($t->output);
    close HTML;
}

sub save_updated_info {
    for my $url (keys %last_update_date) {
        format_last_update_date($url);
    }
    DumpFile($datefile, \%last_update_date);
    DumpFile($entryfile, \%last_update_entries);
    DumpFile($getnewfile, \%always_get_new);
}

sub format_datetime_w3cdtf {
    my $entries = shift;

    my @result;
    for my $e (@$entries) {
        my %entry_info;
        for my $key (keys %$e) {
            if (blessed $e->{$key} && $e->{$key}->isa('DateTime')) {
                $entry_info{$key} = DateTime::Format::W3CDTF->format_datetime($e->{$key});
            }
            else {
                $entry_info{$key} = $e->{$key};
            }
        }
        push @result, \%entry_info;
    }
    return @result;
}

sub parse_datetime_from_w3cdtf {
    my $entries = shift;

    my @result;
    for my $e (@$entries) {
        my %entry_info;
        for my $key (keys %$e) {
            if ($key eq "issued" or $key eq "modified") {
                $entry_info{$key} = DateTime::Format::W3CDTF->parse_datetime($e->{$key});
            }
            else {
                $entry_info{$key} = $e->{$key};
            }
        }
        push @result, \%entry_info;
    }
    return @result;
}

sub get_previous_entries {
    my $filename = shift;
    return unless exists $last_update_entries{$filename};
    return @{$last_update_entries{$filename}};
}

sub pop_previous_entries {
    my $filename = shift;
    my @previous_entries = get_previous_entries($filename);
    if (exists $last_update_entries{$filename}) {
	delete $last_update_entries{$filename};
    }
    return @previous_entries;
}

sub set_last_update_entries {
    my ($filename, @entries) = @_;
    $last_update_entries{$filename} = \@entries;
}

sub filter_out_entries_after {
    my ($entries, $to) = @_;

    return grep { DateTime->compare($_->{date}, $to) <= 0 } @$entries;
}

# get entries between date range ($from, $to] since date of entries
# without date from feed will be assigned to $to.
sub get_entries_between {
    my ($entries, $from, $to) = @_;

    my @new = grep { DateTime->compare($_->{date}, $from) == 1 } @$entries;
    return unless @new;
    return filter_out_entries_after(\@new, $to);
}

sub get_latest_update_date {
    my ($filename, $prev_update) = @_;

    my @entries;
    return unless -e $filename and -f _;

    print STDERR "parse $filename...";
    my $feed;
    eval { $feed = XML::Feed->parse($filename); };
    if ($@) {
        print STDERR "bad feed\n";
        return;
    }
    print STDERR "done\n";

    return unless defined $feed;

    for my $entry ($feed->entries) {
        my %info;
        $info{title} = $entry->title;
        $info{link} = (defined $entry->link) ? $entry->link : $feed->link;

        my $m = $entry->modified;
        if (defined $m) {
            $info{modified} = $m;
            $info{modified}->set_time_zone('Asia/Shanghai');
        }

        $m = $entry->issued;
        my $last_entry_update = get_last_update_date($entry->link);
        $info{issued} = defined $m                 ? $m
                      : defined $last_entry_update ? $last_entry_update
                      :                              $now
                      ;
        $info{issued}->set_time_zone('Asia/Shanghai');

        push @entries, \%info;
    }

    return unless scalar @entries > 0;

    my $i;
    my @sort_candidates;
    for ($i = 0; $i <= $#entries; ++$i) {
        my $d;
        if (exists $entries[$i]->{modified}) {
            $d = $entries[$i]->{modified};
        }
        elsif (exists $entries[$i]->{issued}) {
            $d = $entries[$i]->{issued};
        }
        push @sort_candidates, { date => $d, index => $i };
    }

    my @sorted_entries
        = sort { DateTime->compare($b->{date}, $a->{date}) } @sort_candidates;

    my @new_entries = (! defined $prev_update) ?
        filter_out_entries_after(\@sorted_entries, $now)
            : get_entries_between(\@sorted_entries, $prev_update, $now);

    my @prev_entries;
    if (defined $remove_old_entries) {
	@prev_entries = pop_previous_entries($filename);
    }
    else {
	@prev_entries = get_previous_entries($filename);
    }
    my @parsed_prev_entries = parse_datetime_from_w3cdtf(\@prev_entries);
    # feed is modified but no new entry.
    return { date => $prev_update, entries => \@parsed_prev_entries }
        unless @new_entries;

    my $update = $new_entries[0]->{date};
    my @new_feed_entries = map { $entries[$_->{index}] } @new_entries;
    my @updated_entries = format_datetime_w3cdtf(\@new_feed_entries);

    push @new_feed_entries, @parsed_prev_entries;

    if (! defined $remove_old_entries) {
	push @updated_entries, @prev_entries;
    }
    set_last_update_entries($filename, @updated_entries);

    if (DateTime->compare($update, $now) == 0) {
        for (@new_entries) {
            my $index = $_->{index};
            my $date = $_->{date};
            my $link = $entries[$index]->{link};
            set_last_update_date($link, $date);
        }
    }

    return { date => $update, entries => \@new_feed_entries };
}

sub compute_hexdigest {
    my $filename = shift;

    my $sha = Digest::SHA->new(256);
    $sha->addfile($filename);
    return $sha->hexdigest;
}

sub get_new {
    my ($ua, $filename, $url, @feeds) = @_;

    for my $f (@feeds) {
        print STDERR "fetch $f ...";
        my $response = $ua->get($f, ':content_file' => $filename);
        if ($response->is_success) {
            my $hexdigest = compute_hexdigest($filename);
            my $prevdigest = get_digest_string($url, $f);

            if ($prevdigest eq $hexdigest) {
                print STDERR "up to date\n";
                return 0;
            }
            else {
                print STDERR "done\n";
                set_digest($url, $f, $hexdigest);
                return 1;
            }
        }
        print STDERR "skipped\n";
    }
    return 0;
}

sub update_feed {
    my ($ua, $filename, @feeds) = @_;

    for my $f (@feeds) {
        print STDERR "mirror $f ...";
        my $response;
        eval { $response = $ua->mirror($f, $filename); };
        if ($@) {
            print STDERR "failed\n";
            next;
        }

        if ($response->code == RC_NOT_MODIFIED) {
            print STDERR "up to date\n";
            return 0;
        }
        if ($response->is_success) {
            print STDERR "done\n";
            return 1;
        }
        print STDERR "skipped\n";
    }
    return 0;
}

sub fill_template {
    my ($s, $update, $num_new_feed_entries, $new_feed_entries) = @_;

    my %feed =
        (
            LINK => $s->{htmlUrl},
            TITLE => $s->{title},
            DEFAULT_FEED => $s->{xmlUrl},
            MORE => exists $s->{feeds},
            HAS_NEW_ENTRIES => $num_new_feed_entries,
        );

    if (exists $s->{feeds}) {
        my @others = (ref $s->{feeds}) ? @{$s->{feeds}} : ($s->{feeds});
        my @backup_feeds;
        push @backup_feeds, { feed => $_ } for @others;
        $feed{FEEDS} = \@backup_feeds;
    }

    if (defined $update) {
        $feed{FEED_UPDATE_DATE} = $update;
        set_last_update_date($feed{LINK}, $update);
    }
    else {
        $feed{FEED_UPDATE_DATE} = "N/A";
        remove_last_update_date($feed{LINK});
    }

    if ($num_new_feed_entries > 0) {
        $feed{NEW_ENTRIES} = $new_feed_entries;
    }

    return \%feed;
}

sub make_directory {
    my $xmlUrl = shift;
    my $uri = URI->new($xmlUrl);
    my $domain = $uri->host;
    my @segments = $uri->path_segments;
    pop @segments if $segments[-1] eq "";
    push @segments, $uri->query if defined $uri->query;
    my $path = catdir($feedman_home, $domain, @segments);
    unless (-e $path and -d _) {
        print STDERR "create path: $path\n";
        mkpath($path, 0, 0700);
    }
    return $path;
}

1;
