From 76e72eba5ef9ab8e6f277acec25fdcfe16be1676 Mon Sep 17 00:00:00 2001
From: Lewis John McGibbney <lewis.j.mcgibbney@jpl.nasa.gov>
Date: Tue, 28 Jul 2015 01:14:48 -0400
Subject: [PATCH] Make Annotation serializable and add an initial implementation of
 adding annotations to the Exporter output

---
 .../digitalpebble/behemoth/Annotation.java    |  8 +++-
 .../behemoth/util/AnnotationsUtil.java        |  2 +-
 .../behemoth/util/ContentExtractor.java       | 47 +++++++++++++++++--
 3 files changed, 52 insertions(+), 5 deletions(-)
diff --git a/core/src/main/java/com/digitalpebble/behemoth/Annotation.java b/core/src/main/java/com/digitalpebble/behemoth/Annotation.java
index 6855a00..98c7afb 100644
--- a/core/src/main/java/com/digitalpebble/behemoth/Annotation.java
+++ b/core/src/main/java/com/digitalpebble/behemoth/Annotation.java
@@ -17,6 +17,7 @@
 
 package com.digitalpebble.behemoth;
 
+import java.io.Serializable;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
@@ -25,7 +26,12 @@
  * Implementation of an Annotation. Has a type , metadata and start and end
  * offsets referring to the position in the text of a @class BehemothDocument.
  **/
-public class Annotation implements Comparable<Annotation> {
+public class Annotation implements Comparable<Annotation>, Serializable {
+
+  /**
+   * 
+   */
+  private static final long serialVersionUID = 1L;
 
     private String type;
 
diff --git a/core/src/main/java/com/digitalpebble/behemoth/util/AnnotationsUtil.java b/core/src/main/java/com/digitalpebble/behemoth/util/AnnotationsUtil.java
index ce6bd8a..99de022 100644
--- a/core/src/main/java/com/digitalpebble/behemoth/util/AnnotationsUtil.java
+++ b/core/src/main/java/com/digitalpebble/behemoth/util/AnnotationsUtil.java
@@ -21,13 +21,13 @@
 import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
-import java.util.Set;
 
 import com.digitalpebble.behemoth.Annotation;
 
 public class AnnotationsUtil {
 
     /** Sort the annotations by startOffset **/
+    @SuppressWarnings("unchecked") // NOTE(review): presumably AnnotationComparator is a raw Comparator; confirm
     public static void sort(List<Annotation> input) {
         Collections.sort(input, new AnnotationComparator());
     }
diff --git a/core/src/main/java/com/digitalpebble/behemoth/util/ContentExtractor.java b/core/src/main/java/com/digitalpebble/behemoth/util/ContentExtractor.java
index ec699a5..d6b3347 100644
--- a/core/src/main/java/com/digitalpebble/behemoth/util/ContentExtractor.java
+++ b/core/src/main/java/com/digitalpebble/behemoth/util/ContentExtractor.java
@@ -17,8 +17,15 @@
 
 package com.digitalpebble.behemoth.util;
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.io.ObjectOutputStream;
 import java.net.URLEncoder;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
 import java.util.UUID;
 
 import org.apache.commons.cli.CommandLine;
@@ -31,6 +38,7 @@
 import org.apache.commons.compress.archivers.ArchiveOutputStream;
 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.lang.ArrayUtils;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileStatus;
@@ -44,6 +52,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.digitalpebble.behemoth.Annotation;
 import com.digitalpebble.behemoth.BehemothConfiguration;
 import com.digitalpebble.behemoth.BehemothDocument;
 import com.digitalpebble.behemoth.DocumentFilter;
@@ -74,6 +83,8 @@ public static FileNamingMode toMode(String str) {
 
     // dump the text otherwise
     private boolean dumpBinary = false;
+    // don't dump annotations by default
+    private boolean dumpAnnotations = false;
 
     private ArchiveOutputStream currentArchive = null;
 
@@ -107,6 +118,7 @@ public int run(String[] args) throws Exception {
                 "dumps binary content, text otherwise");
         options.addOption("n", "filenaming", true,
                 "whether to name files based on URL, UUID (default) or NUM");
+        options.addOption("a", "annotation", false, "whether to include annotation in output (off by default)");
 
         // parse the command line arguments
         try {
@@ -122,6 +134,7 @@ public int run(String[] args) throws Exception {
                 return -1;
             }
             dumpBinary = line.hasOption("binary");
+            dumpAnnotations = line.hasOption("annotation");
 
             if (line.hasOption("filenaming")) {
                 String naming = line.getOptionValue("n");
@@ -203,6 +216,7 @@ private void addToArchive(String fileName, byte[] content, Path dirPath)
         numEntriesInCurrentArchive++;
         currentArchive.putArchiveEntry(new ZipArchiveEntry(fileName));
         currentArchive.write(content);
+        LOG.debug("Successfully wrote BehemothDocument 'content' to output.");
         currentArchive.closeArchiveEntry();
         index.flush();
         if (numEntriesInCurrentArchive == maxNumEntriesInArchive) {
@@ -250,11 +264,32 @@ else if (!dumpBinary && inputDoc.getText() == null)
                     fileName += ".txt";
 
                 byte[] contentBytes;
-                if (dumpBinary)
+                List<Annotation> annots = null;
+                if (dumpBinary) {
                     contentBytes = inputDoc.getContent();
-                else
+                    if(dumpAnnotations) {
+                        annots = inputDoc.getAnnotations();
+                        //write annotations with content?
+                        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+                        ObjectOutputStream oos = new ObjectOutputStream(bos);
+                        oos.writeObject(annots);
+                        byte[] annotsBytes = bos.toByteArray();
+                        contentBytes = concatAndDeepClone(contentBytes, annotsBytes);
+                    }
+                } else {
                     contentBytes = inputDoc.getText().getBytes("UTF-8");
-                // out.write(contentBytes, 0, contentBytes.length);
+                    if(dumpAnnotations) {
+                        annots = inputDoc.getAnnotations();
+                        ArrayList<String> annotsArrayList = new ArrayList<String>();
+                        for (int i = 0; i < annots.size(); i++) {
+                          annotsArrayList.add(annots.get(i).toString());
+                        }
+                        
+                        byte[] annotsBytes = annotsArrayList.toString().getBytes(Charset.forName("UTF-8"));
+                        contentBytes = concatAndDeepClone(contentBytes, annotsBytes);
+                    }
+                }
+                
                 addToArchive(fileName, contentBytes, dir);
 
                 // add the mapping URL->filename in the index -> archive num
@@ -264,4 +299,10 @@ else if (!dumpBinary && inputDoc.getText() == null)
             current.close();
         }
     }
+    
+    /** Returns a newly allocated array holding contentBytes followed by annotsBytes. */
+    private byte[] concatAndDeepClone(byte[] contentBytes, byte[] annotsBytes) {
+      // ArrayUtils.addAll always returns a fresh array, so a further clone() would only copy the data twice.
+      return ArrayUtils.addAll(contentBytes, annotsBytes);
+    }
 }