From 55c5a5291e2876ecbae55033861b086c9e7b5bea Mon Sep 17 00:00:00 2001 From: Tal Wrii Date: Sat, 10 Sep 2016 00:19:31 -0500 Subject: [PATCH 1/2] rsample -- use argparse In preparation for adding more options, namely bootstrap replacement and samples for various pseudorandom distributions --- bin/rsample | 52 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/bin/rsample b/bin/rsample index e21c7ee..0f5c60d 100755 --- a/bin/rsample +++ b/bin/rsample @@ -4,11 +4,11 @@ # Jim Bagrow # Last Modified: 2013-02-07 -# Code adapted from: http://data-analytics-tools.blogspot.com/2009/09/reservoir-sampling-algorithm-in-perl.html import sys, os import random - +import argparse + name = os.path.basename(sys.argv[0]) usage = \ """Usage: %s num_samples [FILENAME] @@ -16,28 +16,40 @@ usage = \ Randomly sample num_samples rows from STDIN or FILENAME and pass to STDOUT. Sampling is performed using Reservoir Sampling.""" % (name) -error = "Bad input, run `%s --help` for info."
% (name) +def build_parser(): + parser = argparse.ArgumentParser(description='R|' + usage, formatter_class=SmartFormatter) + parser.add_argument('size', type=int) + parser.add_argument('filename', type=str, nargs='?') + return parser -if __name__ == '__main__': - if '-h' in sys.argv[1:] or '--help' in sys.argv[1:]: - sys.exit( usage ) - - if len(sys.argv) == 3: - input = open(sys.argv[2],'r') - elif len(sys.argv) == 2: - input = sys.stdin; - else: - sys.exit(error) - - N = int(sys.argv[1]); +# http://stackoverflow.com/questions/3853722/python-argparse-how-to-insert-newline-in-the-help-text +class SmartFormatter(argparse.HelpFormatter): + def _split_lines(self, text, width): + if text.startswith('R|'): + return text[2:].splitlines() + # this is the RawTextHelpFormatter._split_lines + return argparse.HelpFormatter._split_lines(self, text, width) + +def sample_without_replacement(size, input): + # Code adapted from: http://data-analytics-tools.blogspot.com/2009/09/reservoir-sampling-algorithm-in-perl.html sample = []; - for i,line in enumerate(input): - if i < N: + if i < size: sample.append(line) - elif i >= N and random.random() < N/float(i+1): + elif i >= size and random.random() < size/float(i+1): replace = random.randint(0,len(sample)-1) sample[replace] = line - - for line in sample: + return sample + + +def main(): + parser = build_parser() + args = parser.parse_args() + + stream = open(args.filename) if args.filename else sys.stdin + for line in sample_without_replacement(args.size, stream): sys.stdout.write(line) + + +if __name__ == '__main__': + main() From 9a75c6b79fe10fe2ca74c878dc3ae32706b06c99 Mon Sep 17 00:00:00 2001 From: Tal Wrii Date: Sat, 10 Sep 2016 00:35:37 -0500 Subject: [PATCH 2/2] rsample -- Sample with replacement --- bin/rsample | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/bin/rsample b/bin/rsample index 0f5c60d..a29bc6f 100755 --- a/bin/rsample +++ b/bin/rsample @@ -20,6 +20,7 @@ def 
build_parser(): parser = argparse.ArgumentParser(description='R|' + usage, formatter_class=SmartFormatter) parser.add_argument('size', type=int) parser.add_argument('filename', type=str, nargs='?') + parser.add_argument('--with-replacement', '-R', action='store_true', help='Sample with replacement') return parser # http://stackoverflow.com/questions/3853722/python-argparse-how-to-insert-newline-in-the-help-text @@ -33,7 +34,7 @@ class SmartFormatter(argparse.HelpFormatter): def sample_without_replacement(size, input): # Code adapted from: http://data-analytics-tools.blogspot.com/2009/09/reservoir-sampling-algorithm-in-perl.html sample = []; - for i,line in enumerate(input): + for i, line in enumerate(input): if i < size: sample.append(line) elif i >= size and random.random() < size/float(i+1): @@ -41,13 +42,33 @@ def sample_without_replacement(size, input): sample[replace] = line return sample +def sample_with_replacement(size, input): + sample = [] + for i, line in enumerate(input): + if i == 0: + sample = [line] * size + else: + for index in range(len(sample)): + # Each sample point should have a 1 / (i + 1) + # probability of being the new value + # It has an equal probability of being any of the old + # values (inductive hypothesis), i.e. i / (i + 1) / i = 1 / (i + 1) + if random.random() < 1 / float(i + 1): + sample[index] = line + return sample + def main(): parser = build_parser() args = parser.parse_args() + if args.with_replacement: + selection_method = sample_with_replacement + else: + selection_method = sample_without_replacement + stream = open(args.filename) if args.filename else sys.stdin - for line in sample_without_replacement(args.size, stream): + for line in selection_method(args.size, stream): sys.stdout.write(line)