Make Annotations serializable and initial implementation of adding annotations to the Exporter output by lewismc · Pull Request #56 · DigitalPebble/behemoth

core/src/main/java/com/digitalpebble/behemoth/Annotation.java

-Original file line number
+Diff line change
@@ Expand Up / @@ -17,6 +17,7 @@ @@
     package com.digitalpebble.behemoth;
+    import java.io.Serializable;
     import java.util.HashMap;
     import java.util.Iterator;
     import java.util.Map;
@@ Expand All / @@ -25,7 +26,12 @@ @@
      * Implementation of an Annotation. Has a type, metadata, and start and end
      * offsets referring to positions in the text of a {@link BehemothDocument}.
      **/
-    public class Annotation implements Comparable<Annotation> {
+    public class Annotation implements Comparable<Annotation>, Serializable {
+      /**
+       *
+       */
+      private static final long serialVersionUID = 1L;
         private String type;
@@ Expand Down @@

core/src/main/java/com/digitalpebble/behemoth/util/AnnotationsUtil.java

-Original file line number
+Diff line change
@@ Expand Up / @@ -21,13 +21,13 @@ @@
     import java.util.Collections;
     import java.util.Iterator;
     import java.util.List;
-    import java.util.Set;
     import com.digitalpebble.behemoth.Annotation;
     public class AnnotationsUtil {
         /**
          * Sorts the given annotations in place by startOffset, using the
          * ordering defined by {@code AnnotationComparator}.
          *
          * @param input the annotations to sort; the list itself is reordered
          **/
+        @SuppressWarnings("unchecked")
         public static void sort(List<Annotation> input) {
             AnnotationComparator byStartOffset = new AnnotationComparator();
             Collections.sort(input, byStartOffset);
         }
@@ Expand Down @@

core/src/main/java/com/digitalpebble/behemoth/util/ContentExtractor.java

-Original file line number
+Diff line change
@@ Expand Up / @@ -17,8 +17,15 @@ @@
     package com.digitalpebble.behemoth.util;
+    import java.io.ByteArrayOutputStream;
     import java.io.IOException;
+    import java.io.ObjectOutputStream;
     import java.net.URLEncoder;
+    import java.nio.charset.Charset;
+    import java.util.ArrayList;
+    import java.util.Arrays;
+    import java.util.Iterator;
+    import java.util.List;
     import java.util.UUID;
     import org.apache.commons.cli.CommandLine;
@@ Expand All / @@ -31,6 +38,7 @@ @@
     import org.apache.commons.compress.archivers.ArchiveOutputStream;
     import org.apache.commons.compress.archivers.ArchiveStreamFactory;
     import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+    import org.apache.commons.lang.ArrayUtils;
     import org.apache.hadoop.conf.Configured;
     import org.apache.hadoop.fs.FSDataOutputStream;
     import org.apache.hadoop.fs.FileStatus;
@@ Expand All / @@ -44,6 +52,7 @@ @@
     import org.slf4j.Logger;
     import org.slf4j.LoggerFactory;
+    import com.digitalpebble.behemoth.Annotation;
     import com.digitalpebble.behemoth.BehemothConfiguration;
     import com.digitalpebble.behemoth.BehemothDocument;
     import com.digitalpebble.behemoth.DocumentFilter;
@@ Expand Down Expand Up / @@ -74,6 +83,8 @@ public static FileNamingMode toMode(String str) { @@
         // dump the text otherwise
         private boolean dumpBinary = false;
+        // don't dump annotations by default
+        private boolean dumpAnnotations = false;
         private ArchiveOutputStream currentArchive = null;
@@ Expand Down Expand Up / @@ -107,6 +118,7 @@ public int run(String[] args) throws Exception { @@
                     "dumps binary content, text otherwise");
             options.addOption("n", "filenaming", true,
                     "whether to name files based on URL, UUID (default) or NUM");
+            options.addOption("a", "annotation", false, "whether to include annotation in output (off by default)");
             // parse the command line arguments
             try {
@@ Expand All / @@ -122,6 +134,7 @@ public int run(String[] args) throws Exception { @@
                     return -1;
                 }
                 dumpBinary = line.hasOption("binary");
+                dumpAnnotations = line.hasOption("annotation");
                 if (line.hasOption("filenaming")) {
                     String naming = line.getOptionValue("n");
@@ Expand Down Expand Up @@
             numEntriesInCurrentArchive++;
             currentArchive.putArchiveEntry(new ZipArchiveEntry(fileName));
             currentArchive.write(content);
+            LOG.debug("Successfully wrote BehemothDocument 'content' to output.");
             currentArchive.closeArchiveEntry();
             index.flush();
             if (numEntriesInCurrentArchive == maxNumEntriesInArchive) {
@@ Expand Down Expand Up / @@ -250,11 +264,32 @@ else if (!dumpBinary && inputDoc.getText() == null) @@
                         fileName += ".txt";
                     byte[] contentBytes;
-                    if (dumpBinary)
+                    List<Annotation> annots = null;
+                    if (dumpBinary) {
                         contentBytes = inputDoc.getContent();
-                    else
+                        if(dumpAnnotations) {
+                            annots = inputDoc.getAnnotations();
+                            //write annotations with content?
+                            ByteArrayOutputStream bos = new ByteArrayOutputStream();
+                            ObjectOutputStream oos = new ObjectOutputStream(bos);
+                            oos.writeObject(annots);
+                            byte[] annotsBytes = bos.toByteArray();
+                            contentBytes = concatAndDeepClone(contentBytes, annotsBytes);
+                        }
+                    } else {
                         contentBytes = inputDoc.getText().getBytes("UTF-8");
-                    // out.write(contentBytes, 0, contentBytes.length);
+                        if(dumpAnnotations) {
+                            annots = inputDoc.getAnnotations();
+                            ArrayList<String> annotsArrayList = new ArrayList<String>();
+                            for (int i = 0; i < annots.size(); i++) {
+                              annotsArrayList.add(annots.get(i).toString());
+                            }
+                            byte[] annotsBytes = annotsArrayList.toString().getBytes(Charset.forName("UTF-8"));
+                            contentBytes = concatAndDeepClone(contentBytes, annotsBytes);
+                        }
+                    }
                     addToArchive(fileName, contentBytes, dir);
                     // add the mapping URL->filename in the index -> archive num
@@ Expand All / @@ -264,4 +299,10 @@ else if (!dumpBinary && inputDoc.getText() == null) @@
                 current.close();
             }
         }
+        /**
+         * Concatenates the document content bytes and the annotation bytes into a
+         * newly allocated array. Neither input array is modified, and the result
+         * never shares storage with either of them.
+         *
+         * @param contentBytes bytes of the document content; {@code null} is
+         *                     treated as empty
+         * @param annotsBytes  bytes of the annotations; {@code null} is treated
+         *                     as empty
+         * @return a new array holding {@code contentBytes} followed by
+         *         {@code annotsBytes}; never {@code null}
+         */
+        private byte[] concatAndDeepClone(byte[] contentBytes, byte[] annotsBytes) {
+          int contentLength = (contentBytes == null) ? 0 : contentBytes.length;
+          int annotsLength = (annotsBytes == null) ? 0 : annotsBytes.length;
+          // A single allocation is enough: the result is already independent of
+          // both inputs, so cloning it again would only copy the data twice.
+          byte[] concatBytes = new byte[contentLength + annotsLength];
+          if (contentLength > 0) {
+            System.arraycopy(contentBytes, 0, concatBytes, 0, contentLength);
+          }
+          if (annotsLength > 0) {
+            System.arraycopy(annotsBytes, 0, concatBytes, contentLength, annotsLength);
+          }
+          return concatBytes;
+        }
     }

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Make Annotations serializable and initial implementation of adding annotations to the Exporter output #56

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Make Annotations serializable and initial implementation of adding annotations to the Exporter output #56

Are you sure you want to change the base?

Uh oh!

Make Annotations serializable and initial implementation of adding annotations to the Exporter output #56

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing