#!/usr/bin/perl use strict; use warnings; use Getopt::Long; use Pod::Usage; use utf8 (); =head1 SYNOPSIS import_csv.pl import_csv.pl --csv_path=/home/iain/Projects/ironman/csv --db_path=/home/iain/Projects/ironman/subscriptions.db This script is a one hit import from the old csv file to the new feed db. pass the csv_path and db_path e.g. import_csv.pl --csv_path=/home/iain/Projects/ironman/csv --db_path=/home/iain/Projects/ironman/subscriptions.db It is safe to run multiple times on the same data. A file called problems.txt will be generated with .... problems, usually missing dates. =cut use Data::Dumper; use IronMan::Schema; use Text::CSV_XS; use DateTime::Format::HTTP; my ( $csv_path, $db_path ); GetOptions( 'csv_path=s' => \$csv_path, 'db_path=s' => \$db_path, ) or die pod2usage; unless($csv_path && $db_path) { print "bugger"; die pod2usage; } $csv_path .= '/*'; my @files = glob($csv_path); my $schema = IronMan::Schema->connect("dbi:SQLite:$db_path"); print scalar(@files)," files found\n"; binmode(STDOUT, ":utf8"); open(PROBLEMS, '>', 'problems.txt') || die "cant open problems file $!"; FILES: foreach my $file (@files) { # make sure we have a valid filename and trim it to something we can feed into a DB query if($file =~ /my_(.*?)\.csv$/) { my $file_clean = $1; print "Processing file $file\n"; my $poster; my $feeds_rs = $schema->resultset('Feed')->search({ title => { like => $file_clean } }); # some problem with count, cant remember what though my @rows = $feeds_rs->all; if(scalar(@rows) > 0) { $poster = shift @rows; if(scalar(@rows) > 1) { # I dont think this happens anymore but there is no harm leaving it in. # if we have multiple posts it may be the same author on different blogs # loop though and check they are the same while(my $compare = shift @rows) { unless($compare->title eq $poster->title) { print PROBLEMS "Unable to determine author in file: $file\n"; next FILES; } } } } else { print PROBLEMS "No feed entry found for file: $file\n"; next; } print "\tfound author: ".$poster->title,"\n"; open my $fh, "<:encoding(utf8)", $file or die "$file: $!"; my $csv = Text::CSV_XS->new ({ binary => 1 }) or die "Cannot use CSV: ".Text::CSV->error_diag (); my %posts; my $posts_rs = $schema->resultset('Post'); while (my $row = $csv->getline ($fh)) { unless($row->[3]) { print PROBLEMS "no date found for post:$row->[1], file:$file\n"; next; } my $new_date = DateTime::Format::HTTP->parse_datetime($row->[3]); my $post = $posts_rs->find({url => $row->[2]}); # initialise the most recent post date if($post && !exists($posts{$row->[2]})) { $posts{$row->[2]} = $post->posted_on(); } if($post) { print "\tDupe Found ".$post->id.",".$post->title."\n"; # if this is a newer post update the details if($new_date > $posts{$row->[2]}) { #the post is more recent update all bar the date print "\t\tdupe more recent $new_date\n"; $post->update({ title => $row->[1], url => $row->[2], summary => $row->[4], summary_filtered => $row->[4], }); # track the most recent post date $posts{$row->[2]} = $post->posted_on(); } elsif($new_date < $post->posted_on()) { # post is older update the posted_on print "\t\tdupe older $new_date\n"; $post->update({ posted_on => $new_date, }); } else { print "\t\tDate the same, dont do anything\n"; } } else { # this is a new post eval { $post = $poster->create_related('posts', { title => $row->[1], url => $row->[2], author => $row->[0], tags => '', posted_on => $new_date, body => '', summary => $row->[4], summary_filtered => $row->[4], } ); }; if($@) { print PROBLEMS "Unable to create new post row, probably a dupe $@ \n"; print PROBLEMS Dumper({ title => $row->[1], url => $row->[2], author => $row->[0], tags => '', posted_on => $new_date->datetime(), body => '', summary => $row->[4], }); next; } print "\tnew post added ".$post->id.",".$post->title."\n"; # track the most recent post date $posts{$row->[2]} = $post->posted_on(); } } $csv->eof or $csv->error_diag (); close $fh; } }