#!/usr/bin/perl # # Copyright Albert Danial # May 20005 # This code is released under the GNU General Public License. # use warnings; use strict; use File::Basename; use IO::File; use DBI; use DBD::SQLite 1.00; use Text::CSV::Simple; use Time::HiRes qw( time ); use vars qw($opt_d $opt_t ); use Getopt::Std; my $script = basename $0; my $usage = " Usage: $script [options] Reads the 21 .csv files contained in http://baseball1.info/statistics/lahman52_csv.zip and from them creates either as a SQLite database or a text file of SQL create and insert commands (essentially a database dump). The schema is described in http://baseball1.com/statistics/readme52.txt The .csv files must be present in the current directory. Note: Indices are deliberately not created for before/after demonstration purposes. These are a good start: create index i1 on Master(playerid); create index i2 on Fielding(playerid); create index i3 on Batting(playerid); create index i4 on Batting(lgid); Sample session: cd /tmp mkdir work cd work wget http://baseball1.info/statistics/lahman52_csv.zip unzip lahman52_csv.zip ./$script -t 5000 baseball.sql Options: -d Create as an SQLite database instead of a text file of SQL statements (creating an SQLite database this way is much slower than first creating the text file then cat'ing the text file into sqlite like so: cat | sqlite baseball.db -t N Start a new transaction every N inserts. Default transaction size is 10,000. Limitation: Text::CSV::Simple is unable to parse one line from Master.csv. As this script runs an error that starts with Failed on 14587,\"velasgu01\"... will appear. "; die $usage unless @ARGV; my @csv_files = qw( Allstar.csv Fielding.csv Pitching.csv AwardsManagers.csv FieldingOF.csv PitchingPost.csv AwardsPlayers.csv FieldingPost.csv Salaries.csv AwardsShareManagers.csv HallOfFame.csv SeriesPost.csv AwardsSharePlayers.csv Managers.csv Teams.csv Batting.csv ManagersHalf.csv TeamsFranchises.csv BattingPost.csv Master.csv TeamsHalf.csv ); getopts('pt:'); $opt_t = 10_000 unless $opt_t; my $db_file = shift @ARGV; unlink $db_file if -r $db_file; my $start_time = time; my $dbh = DBI->connect("dbi:SQLite:dbname=$db_file", "", ""); if (!$opt_d) { # regular text file $dbh = new IO::File "> $db_file"; die "Unable to open $db_file for writing\n" unless defined $dbh; } else { # an SQLite database $dbh = DBI->connect("dbi:SQLite:dbname=$db_file", "", "") or die "Unable to create SQLite database file $db_file\n"; } create_schema($dbh); foreach my $file(sort @csv_files) { load_file($dbh, $file); } if (!$opt_d) { # regular text file $dbh->close; } else { # an SQLite database $dbh->disconnect; } my $end_time = time; printf "%6.2f s total\n", $end_time - $start_time; sub create_schema { # {{{1 my ($dbh) = @_; $/ = "\n\n"; # read paragraphs my @SQL = ; $/ = "\n"; # reset input record separator to default # don't bother with transactions for the table creation commands foreach my $sql_command (@SQL) { if (!$opt_d) { print $dbh $sql_command; } else { $dbh->do($sql_command) or die "Failed to do [$sql_command]: $dbh->errstr"; } } } # 1}}} sub load_file { # {{{1 my ($dbh , $file , ) = @_; my ($table_name, $st); if ($file =~ /^(\w+)\.csv$/) { $table_name = $1; } else { die "Expected a .csv file, got $file"; } my $start_time = time; open IN, $file or die "Unable to read $file: $!\n"; chomp(my $column_names = ); close IN; my @names = split(/,/, $column_names); my $nFields = scalar @names; my $parser = Text::CSV::Simple->new; # print "before reading $file\n"; my @data = $parser->read_file($file); # print "after reading $file\n"; shift @data; # lose the first line w/column names my $nInsert = 0; if (!$opt_d) { print $dbh "begin transaction;\n"; } else { $dbh->begin_work; # start a transaction } foreach my $line (@data) { my @fields = (); for (my $i = 0; $i < $nFields; $i++) { if (defined $line->[$i] and $line->[$i]) { $line->[$i] =~ s/"//g; # remove quotes $line->[$i] =~ s/\cM$//g; # remove trailing control-M if ($line->[$i] =~ /\D/) { # contains nondigit; wrap in quotes push @fields, '"' . $line->[$i] . '"'; } else { push @fields, $line->[$i]; } } else { # empty field push @fields, 'NULL'; } } my $F = join(",", @fields); my $sql_command = "insert into $table_name values ( $F );" ; if (!$opt_d) { print $dbh "$sql_command\n"; } else { $dbh->do($sql_command) or die "Failed to do [$sql_command]: $dbh->errstr"; } ++$nInsert; if (!($nInsert % $opt_t)) { if (!$opt_d) { print $dbh "commit;\n"; print $dbh "begin transaction;\n"; } else { $dbh->commit; # end the transaction # print "$nInsert inserts\n"; $dbh->begin_work; # start a transaction } } } my $end_time = time; if (!$opt_d) { print $dbh "commit;\n"; warn sprintf "%6.2f s to load %6d entries (%2.f rec/s) table %s\n", $end_time - $start_time , scalar @data , (scalar @data)/($end_time - $start_time), $table_name ; } else { $dbh->commit; # end the transaction printf "%6.2f s to write %6d entries (%2.f rec/s) to table %s\n", $end_time - $start_time , scalar @data , (scalar @data)/($end_time - $start_time), $table_name ; } } # 1}}} __END__ PRAGMA synchronous = OFF; create table Allstar ( -- All-Star appearances -- -- Original data from http://baseball1.info/statistics/lahman52_csv.zip -- Schema explained in http://baseball1.com/statistics/readme52.txt -- .cvs files converted to SQL create/insert commands with the Perl -- script http://danial.org/sqlite/lampsig/baseball_db -- playerID text , -- Player ID code yearID integer , -- Year lgID text -- League ); create table AwardsManagers ( -- awards won by managers managerID text , -- Manager ID code awardID text , -- Name of award won yearID integer , -- Year lgID text , -- League tie text , -- Award was a tie (Y or N) notes text -- Notes about the award ); create table AwardsPlayers ( -- awards won by players playerID text , -- Player ID code awardID text , -- Name of award won yearID integer , -- Year lgID text , -- League tie text , -- Award was a tie (Y or N) notes text -- Notes about the award ); create table AwardsShareManagers ( -- award voting for manager awards awardID text , -- name of award votes were received for yearID integer , -- Year lgID text , -- League managerID text , -- Manager ID code pointsWon integer , -- Number of points received pointsMax integer , -- Maximum number of points possible votesFirst integer -- Number of first place votes ); create table AwardsSharePlayers ( -- award voting for player awards awardID text , -- name of award votes were received for yearID integer , -- Year lgID text , -- League playerID text , -- Player ID code pointsWon integer , -- Number of points received pointsMax integer , -- Maximum number of points possible votesFirst integer -- Number of first place votes ); create table Batting ( -- batting statistics playerID text , -- Player ID code yearID integer , -- Year stint text , -- player's stint (order of appearances within a season) teamID text , -- Team lgID text , -- League G integer , -- Games AB integer , -- At Bats R integer , -- Runs H integer , -- Hits T2B integer , -- Doubles T3B integer , -- Triples HR integer , -- Homeruns RBI integer , -- Runs Batted In SB integer , -- Stolen Bases CS integer , -- Caught Stealing BB integer , -- Base on Balls SO integer , -- Strikeouts IBB integer , -- Intentional walks HBP integer , -- Hit by pitch SH integer , -- Sacrifice hits SF integer , -- Sacrifice flies GIDP integer -- Grounded into double plays ); create table BattingPost ( -- post-season batting statistics yearID integer , -- Year round text , -- Level of playoffs playerID text , -- Player ID code teamID text , -- Team lgID text , -- League G integer , -- Games AB integer , -- At Bats R integer , -- Runs H integer , -- Hits T2B integer , -- Doubles T3B integer , -- Triples HR integer , -- Homeruns RBI integer , -- Runs Batted In SB integer , -- Stolen Bases CS integer , -- Caught stealing BB integer , -- Base on Balls SO integer , -- Strikeouts IBB integer , -- Intentional walks HBP integer , -- Hit by pitch SH integer , -- Sacrifices SF integer , -- Sacrifice flies GIDP integer -- Grounded into double plays ); create table Fielding ( -- fielding statistics playerID text , -- Player ID code yearID integer , -- Year stint integer , -- player's stint (order of appearances within a season) teamID text , -- Team lgID text , -- League POS text , -- Position G integer , -- Games GS integer , -- Games Started InnOuts integer , -- Time played in the field expressed as outs PO integer , -- Putouts A integer , -- Assists E integer , -- Errors DP integer , -- Double Plays PB integer , -- Passed Balls ZR float -- Zone Rating ); create table FieldingOF ( -- outfield position data playerID text , -- Player ID code yearID integer , -- Year stint integer , -- player's stint (order of appearances within a season) Glf integer , -- Games played in left field Gcf integer , -- Games played in center field Grf integer -- Games played in right field ); create table FieldingPost ( -- post-season fielding data PlayerID text , -- Player ID code yearID integer , -- Year teamID text , -- Team lgID text , -- League round text , -- Level of playoffs POS text , -- Position G integer , -- Games GS integer , -- Games Started InnOuts integer , -- Time played in the field expressed as outs PO integer , -- Putouts A integer , -- Assists E integer , -- Errors DP integer , -- Double Plays TP integer , -- Triple Plays PB integer , -- Passed Balls SB integer , -- Stolen Bases allowed (by catcher) CS integer -- Caught STealing (by catcher) ); create table HallOfFame ( -- Hall of Fame voting data hofID text , -- Player ID code yearID integer , -- Year of ballot votedBy text , -- Method by which player was voted upon ballots integer , -- Total ballots cast in that year needed integer , -- Number of votes needed for selection in that year votes integer , -- Total votes received inducted text , -- Whether player was inducted by that vote or not (Y/N) category text -- Category in which can ); create table Managers ( -- managerial statistics managerID text , -- Player ID Number yearID integer , -- Year teamID text , -- Team lgID text , -- League inseason integer , -- Managerial order. Zero if individual managed the -- team the entire year. Otherwise denotes where the -- manager appeared in the managerial order (1 for -- first manager, 2 for second, etc.) in the managerial -- order (1 for first manager, 2 for second, etc.) G integer , -- Games managed W integer , -- Wins L integer , -- Losses rank integer , -- Team's final position in standings that year plyrMgr text -- Player Manager (denoted by 'Y') ); create table ManagersHalf ( -- split season data for managers managerID text , -- Manager ID code yearID integer , -- Year teamID text , -- Team lgID text , -- League inseason integer , -- Managerial order. Zero if individual managed the -- team the entire year. Otherwise denotes where the -- manager appeared in the managerial order (1 for -- first manager, 2 for second, etc.) in the managerial -- order (1 for first manager, 2 for second, etc.) half integer , -- First or second half of season G integer , -- Games managed W integer , -- Wins L integer , -- Losses rank integer -- Team's position in standings for the half ); create table Master ( -- Player names, DOB, and biographical info lahmanID integer , -- ID used in Lahman Database playerID text , -- A unique code asssigned to each player. managerID text , -- foreign key to Managers.managerID hofID text , -- foreign key to HallOfFame.hofID birthYear integer , -- Year player was born birthMonth integer , -- Month player was born birthDay integer , -- Day player was born birthCountry text , -- Country where player was born birthState text , -- State where player was born birthCity text , -- City where player was born deathYear integer , -- Year player died deathMonth integer , -- Month player died deathDay integer , -- Day player died deathCountry text , -- Country where player died deathState text , -- State where player died deathCity text , -- City where player died nameFirst text , -- Player's first name nameLast text , -- Player's last name nameNote text , -- Note about player's name (usually signifying -- that they changed their name or played under -- two differnt names) nameGiven text , -- Player's given name (typically first and middle) nameNick text , -- Player's nickname weight integer , -- Player's weight in pounds height integer , -- Player's height in inches bats text , -- Player's batting hand (left, right, or both) throws text , -- Player's throwing hand (left or right) debut text , -- Date of first major league appearance college text , -- College attended lahman40ID text , -- ID used in Lahman database version 4.0 lahman45ID text , -- ID used in Lahman database version 4.5 retroID text , -- ID used by retrosheet holtzID text , -- ID used by Sean Holtz's Baseball Almanac bbrefID text -- ID used by Baseball Reference website ); create table Pitching ( -- pitching statistics playerID text , -- Player ID code yearID integer , -- Year stint text , -- player's stint (order of appearances within a season) teamID text , -- Team lgID text , -- League W integer , -- Wins L integer , -- Losses G integer , -- Games GS integer , -- Games Started CG integer , -- Complete Games SHO integer , -- Shutouts SV integer , -- Saves IPouts integer , -- Outs Pitched (innings pitched x 3) H integer , -- Hits ER integer , -- Earned Runs HR integer , -- Homeruns BB integer , -- Walks SO integer , -- Strikeouts BAOpp integer , -- Opponent's Batting Average ERA float , -- Earned Run Average IBB integer , -- Intentional Walks WP integer , -- Wild Pitches HBP integer , -- Batters Hit By Pitch BK integer , -- Balks BFP integer , -- Batters faced by Pitcher GF integer , -- Games Finished R integer -- Runs Allowed ); create table PitchingPost ( -- post-season pitching statistics playerID text , -- Year yearID text , -- Level of playoffs round text , -- Player ID code teamID text , -- Team lgID text , -- League W integer , -- Wins L integer , -- Losses G integer , -- Games GS integer , -- Games Started CG integer , -- Complete Games SHO integer , -- Shutouts SV integer , -- Saves IPouts integer , -- Outs Pitched (innings pitched x 3) H integer , -- Hits ER integer , -- Earned Runs HR integer , -- Homeruns BB integer , -- Walks SO integer , -- Strikeouts BAOpp integer , -- Opponents' batting average ERA float , -- Earned Run Average IBB float , -- Intentional Walks WP integer , -- Wild Pitches HBP integer , -- Batters Hit By Pitch BK integer , -- Balks BFP integer , -- Batters faced by Pitcher GF integer , -- Games Finished R integer , -- Runs Allowed SH integer , -- Sacrifice Hits allowed SF integer , -- Sacrifice Flies allowed GIDP integer -- Grounded into Double Plays ); create table Salaries ( -- player salary data yearID integer , -- Year teamID text , -- Team lgID text , -- League playerID text , -- Player ID code salary float -- Salary ); create table SeriesPost ( -- post-season series information yearID integer , -- Year round text , -- Level of playoffs teamIDwinner text , -- Team ID of the team that won the series lgIDwinner text , -- League ID of the team that won the series teamIDloser text , -- Team ID of the team that lost the series lgIDloser text , -- League ID of the team that lost the series wins integer , -- Wins by team that won the series losses integer , -- Losses by team that won the series ties integer -- Tie games ); create table Teams ( -- yearly stats and standings yearID integer , -- Year lgID text , -- League teamID text , -- Team franchID text , -- Franchise (links to TeamsFranchise table) divID text , -- Team's division Rank integer , -- Position in final standings G integer , -- Games played Ghome integer , -- Games played at home W integer , -- Wins L integer , -- Losses DivWin text , -- Division Winner (Y or N) WCWin text , -- Wild Card Winner (Y or N) LgWin text , -- League Champion(Y or N) WSWin text , -- World Series Winner (Y or N) R integer , -- Runs scored AB integer , -- At bats H integer , -- Hits by batters T2B integer , -- Doubles T3B integer , -- Triples HR integer , -- Homeruns by batters BB integer , -- Walks by batters SO integer , -- Strikeouts by batters SB integer , -- Stolen bases CS integer , -- Caught stealing HBP integer , -- Batters hit by pitch SF integer , -- Sacrifice flies RA integer , -- Opponents runs scored ER integer , -- Earned runs allowed ERA integer , -- Earned run average CG integer , -- Complete games SHO integer , -- Shutouts SV integer , -- Saves IPouts integer , -- Outs Pitched (innings pitched x 3) HA integer , -- Hits allowed HRA integer , -- Homeruns allowed BBA integer , -- Walks allowed SOA integer , -- Strikeouts by pitchers E integer , -- Errors DP integer , -- Double Plays FP float , -- Fielding percentage name text , -- Team's full name park text , -- Name of team's home ballpark attendance integer , -- Home attendance total BPF integer , -- Three-year park factor for batters PPF integer , -- Three-year park factor for pitchers teamIDBR text , -- Team ID used by Baseball Reference website teamIDlahman45 text , -- Team ID used in Lahman database version 4.5 teamIDretro text -- Team ID used by Retrosheet ); create table TeamsFranchises ( -- franchise information franchID text , -- Franchise ID franchName text , -- Franchise name active text , -- Whetehr team is currently active (Y or N) NAassoc text -- ID of National Association team franchise played as ); create table TeamsHalf ( -- split season data for teams yearID text , -- Year lgID text , -- Team teamID text , -- League Half text , -- First or second half of season divID text , -- Division DivWin text , -- Won Division (Y or N) Rank integer , -- Team's position in standings for the half G integer , -- Games played W integer , -- Wins L integer -- Losses -- -- Original data from http://baseball1.info/statistics/lahman52_csv.zip -- Schema explained in http://baseball1.com/statistics/readme52.txt -- .cvs files converted to SQL create/insert commands with the Perl -- script http://danial.org/sqlite/lampsig/baseball_db -- );