Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 54 additions & 21 deletions bin/rsample
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,73 @@
# Jim Bagrow
# Last Modified: 2013-02-07

# Code adapted from: http://data-analytics-tools.blogspot.com/2009/09/reservoir-sampling-algorithm-in-perl.html

import sys, os
import random

import argparse

# Name the script was invoked as; substituted into the messages below.
name = os.path.basename(sys.argv[0])

# Help text describing the command line and what the tool does.
usage = """Usage: %s num_samples [FILENAME]

Randomly sample num_samples rows from STDIN or FILENAME and pass to STDOUT.
Sampling is performed using Reservoir Sampling.""" % name

# Short message pointing the user at --help.
error = "Bad input, run `%s --help` for info." % name
def build_parser():
    """Build the command-line parser for the sampler.

    The parser accepts a required integer ``size``, an optional
    ``filename`` positional and the ``--with-replacement`` / ``-R`` flag.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    # NOTE(review): the 'R|' prefix is the marker SmartFormatter checks for.
    # argparse lays out the description through the formatter's _fill_text
    # hook, so confirm SmartFormatter honours the marker there too.
    parser = argparse.ArgumentParser(description='R|' + usage, formatter_class=SmartFormatter)
    parser.add_argument('size', type=int, help='Number of rows to sample')
    parser.add_argument('filename', type=str, nargs='?', help='File to sample from (default: STDIN)')
    parser.add_argument('--with-replacement', '-R', action='store_true', help='Sample with replacement')
    return parser

# NOTE(review): the statements below read sys.argv directly and look like
# hand-rolled command-line handling that predates build_parser()/main() --
# confirm whether they are still meant to run.
if __name__ == '__main__':
# Exit with the usage text when -h or --help is among the arguments.
if '-h' in sys.argv[1:] or '--help' in sys.argv[1:]:
sys.exit( usage )

# Two arguments: sample count plus a file name; one argument: read STDIN.
if len(sys.argv) == 3:
input = open(sys.argv[2],'r')
elif len(sys.argv) == 2:
input = sys.stdin;
else:
# Any other argument count is rejected with the short error message.
sys.exit(error)

# Requested number of samples, taken from the first argument.
N = int(sys.argv[1]);
# http://stackoverflow.com/questions/3853722/python-argparse-how-to-insert-newline-in-the-help-text
class SmartFormatter(argparse.HelpFormatter):
    """Help formatter that keeps the line breaks of texts marked with ``R|``.

    A description, epilog or argument help string that starts with ``R|``
    is emitted with its own newlines (the marker itself is dropped). Any
    other text is wrapped by ``argparse.HelpFormatter`` as usual.
    """

    _RAW_MARKER = 'R|'

    def _split_lines(self, text, width):
        """Split an argument help string into output lines."""
        if text.startswith(self._RAW_MARKER):
            return text[len(self._RAW_MARKER):].splitlines()
        return argparse.HelpFormatter._split_lines(self, text, width)

    def _fill_text(self, text, width, indent):
        """Lay out a description or epilog block.

        argparse routes descriptions and epilogs through ``_fill_text``
        rather than ``_split_lines``, so the marker has to be honoured here
        as well; otherwise ``R|`` is printed literally and the text is
        re-wrapped.
        """
        if text.startswith(self._RAW_MARKER):
            raw = text[len(self._RAW_MARKER):]
            return ''.join(indent + line for line in raw.splitlines(True))
        return argparse.HelpFormatter._fill_text(self, text, width, indent)

def sample_without_replacement(size, input):
    """Draw up to ``size`` items from ``input`` without replacement.

    Uses reservoir sampling: ``input`` is iterated once and at most
    ``size`` items are held at any time.

    Args:
        size: number of items to keep.
        input: iterable of items (e.g. an open file yielding lines).

    Returns:
        list: the sampled items. When ``input`` yields fewer than ``size``
        items all of them are returned; when ``size`` is not positive the
        list is empty.
    """
    # Code adapted from: http://data-analytics-tools.blogspot.com/2009/09/reservoir-sampling-algorithm-in-perl.html
    sample = []

    for i, line in enumerate(input):
        if i < size:
            # The first `size` items fill the reservoir directly.
            sample.append(line)
        elif random.random() < size / float(i + 1):
            # Keep item i with probability size / (i + 1), overwriting a
            # uniformly chosen current member of the reservoir.
            replace = random.randint(0, len(sample) - 1)
            sample[replace] = line

    return sample

def sample_with_replacement(size, input):
    """Draw ``size`` items from ``input`` with replacement in one pass.

    Args:
        size: number of sample slots to fill.
        input: iterable of items (e.g. an open file yielding lines).

    Returns:
        list: ``size`` items, each taken from ``input`` (repeats allowed).
        The list is empty when ``input`` yields nothing or ``size`` is not
        positive.
    """
    sample = []
    for i, line in enumerate(input):
        if i == 0:
            # Every slot starts out holding the first item.
            sample = [line] * size
            continue

        # Each sample point should have a 1 / (i + 1) probability of being
        # the new value. It has an equal probability of being any of the
        # old values (inductive hypothesis), i.e. i / (i + 1) / i = 1 / (i + 1).
        # The threshold depends only on i, so compute it once per item.
        threshold = 1 / float(i + 1)
        for index in range(len(sample)):
            if random.random() < threshold:
                sample[index] = line
    return sample


def main():
    """Parse the command line, sample the input and write the sample to STDOUT.

    Rows are read from the file named on the command line, or from STDIN
    when no file name is given.
    """
    args = build_parser().parse_args()

    if args.with_replacement:
        selection_method = sample_with_replacement
    else:
        selection_method = sample_without_replacement

    if args.filename:
        # Both sampling functions return a list, so the file can be closed
        # as soon as sampling finishes.
        with open(args.filename) as stream:
            sample = selection_method(args.size, stream)
    else:
        sample = selection_method(args.size, sys.stdin)

    sys.stdout.writelines(sample)


# Run the sampler only when this file is executed as a script.
if __name__ == '__main__':
    main()