Skip to content
This repository was archived by the owner on Jul 10, 2019. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package com.digitalpebble.behemoth;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
Expand All @@ -25,7 +26,12 @@
* Implementation of an Annotation. Has a type, metadata, and start and end
* offsets referring to positions in the text of a {@link BehemothDocument}.
**/
public class Annotation implements Comparable<Annotation> {
public class Annotation implements Comparable<Annotation>, Serializable {

/**
*
*/
private static final long serialVersionUID = 1L;

private String type;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import com.digitalpebble.behemoth.Annotation;

public class AnnotationsUtil {

/**
 * Sorts the given list of annotations in place, using the ordering defined
 * by {@code AnnotationComparator} (by startOffset).
 *
 * @param input the annotations to reorder; the list itself is modified
 **/
@SuppressWarnings("unchecked")
public static void sort(List<Annotation> input) {
    AnnotationComparator ordering = new AnnotationComparator();
    Collections.sort(input, ordering);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,15 @@

package com.digitalpebble.behemoth.util;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;

import org.apache.commons.cli.CommandLine;
Expand All @@ -31,6 +38,7 @@
import org.apache.commons.compress.archivers.ArchiveOutputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
Expand All @@ -44,6 +52,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.digitalpebble.behemoth.Annotation;
import com.digitalpebble.behemoth.BehemothConfiguration;
import com.digitalpebble.behemoth.BehemothDocument;
import com.digitalpebble.behemoth.DocumentFilter;
Expand Down Expand Up @@ -74,6 +83,8 @@ public static FileNamingMode toMode(String str) {

// dump the text otherwise
private boolean dumpBinary = false;
// don't dump annotations by default
private boolean dumpAnnotations = false;

private ArchiveOutputStream currentArchive = null;

Expand Down Expand Up @@ -107,6 +118,7 @@ public int run(String[] args) throws Exception {
"dumps binary content, text otherwise");
options.addOption("n", "filenaming", true,
"whether to name files based on URL, UUID (default) or NUM");
options.addOption("a", "annotation", false, "whether to include annotation in output (off by default)");

// parse the command line arguments
try {
Expand All @@ -122,6 +134,7 @@ public int run(String[] args) throws Exception {
return -1;
}
dumpBinary = line.hasOption("binary");
dumpAnnotations = line.hasOption("annotation");

if (line.hasOption("filenaming")) {
String naming = line.getOptionValue("n");
Expand Down Expand Up @@ -203,6 +216,7 @@ private void addToArchive(String fileName, byte[] content, Path dirPath)
numEntriesInCurrentArchive++;
currentArchive.putArchiveEntry(new ZipArchiveEntry(fileName));
currentArchive.write(content);
LOG.debug("Successfully wrote BehemothDocument 'content' to output.");
currentArchive.closeArchiveEntry();
index.flush();
if (numEntriesInCurrentArchive == maxNumEntriesInArchive) {
Expand Down Expand Up @@ -250,11 +264,32 @@ else if (!dumpBinary && inputDoc.getText() == null)
fileName += ".txt";

byte[] contentBytes;
if (dumpBinary)
List<Annotation> annots = null;
if (dumpBinary) {
contentBytes = inputDoc.getContent();
else
if(dumpAnnotations) {
annots = inputDoc.getAnnotations();
//write annotations with content?
ByteArrayOutputStream bos = new ByteArrayOutputStream();
ObjectOutputStream oos = new ObjectOutputStream(bos);
oos.writeObject(annots);
byte[] annotsBytes = bos.toByteArray();
contentBytes = concatAndDeepClone(contentBytes, annotsBytes);
}
} else {
contentBytes = inputDoc.getText().getBytes("UTF-8");
// out.write(contentBytes, 0, contentBytes.length);
if(dumpAnnotations) {
annots = inputDoc.getAnnotations();
ArrayList<String> annotsArrayList = new ArrayList<String>();
for (int i = 0; i < annots.size(); i++) {
annotsArrayList.add(annots.get(i).toString());
}

byte[] annotsBytes = annotsArrayList.toString().getBytes(Charset.forName("UTF-8"));
contentBytes = concatAndDeepClone(contentBytes, annotsBytes);
}
}

addToArchive(fileName, contentBytes, dir);

// add the mapping URL->filename in the index -> archive num
Expand All @@ -264,4 +299,10 @@ else if (!dumpBinary && inputDoc.getText() == null)
current.close();
}
}

/**
 * Concatenates the document content and the annotation bytes into a single,
 * freshly allocated array ({@code contentBytes} first, then
 * {@code annotsBytes}).
 *
 * The returned array never aliases either argument, so callers may keep
 * modifying their inputs afterwards. A {@code null} argument is treated as
 * an empty array; if both are {@code null} an empty array is returned.
 *
 * @param contentBytes the bytes of the document content, may be {@code null}
 * @param annotsBytes the bytes of the annotations, may be {@code null}
 * @return a new array holding {@code contentBytes} followed by
 *         {@code annotsBytes}; never {@code null}
 */
private byte[] concatAndDeepClone(byte[] contentBytes, byte[] annotsBytes) {
    int contentLength = (contentBytes == null) ? 0 : contentBytes.length;
    int annotsLength = (annotsBytes == null) ? 0 : annotsBytes.length;

    // Allocate the result once: a single copy of each input is enough to
    // guarantee independence from the arguments, no extra clone() needed.
    byte[] combined = new byte[contentLength + annotsLength];
    if (contentLength > 0) {
        System.arraycopy(contentBytes, 0, combined, 0, contentLength);
    }
    if (annotsLength > 0) {
        System.arraycopy(annotsBytes, 0, combined, contentLength, annotsLength);
    }
    return combined;
}
}