import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.util.Map;
import java.util.LinkedHashMap;
import java.util.HashSet;
import java.util.Arrays;
/**
* Perform n-gram analysis of a string.
*
* Analyses the frequency with which distinct n-grams, of length n,
* appear in an input string. For the purposes of all analyses of the input
* string, the final n-1 n-grams appearing in the string should be
* “filled out” to a length of n characters, by adding
* a sequence of contiguous characters from the start of the string.
* e.g. “abbc” includes “bca” and “cab” in its 3-grams
*/
public class NgramAnalyser {
/** dictionary of all distinct n-grams and their frequencies */
private HashMap<String,Integer> ngram;
/** number of distinct characters in the input */
private int alphabetSize;
/** n-gram size for this object (new field) */
private int ngramSize;
/**
* Analyse the frequency with which distinct n-grams, of length n,
* appear in an input string.
* n-grams at the end of the string wrap to the front
* e.g. “abbbbc” includes “bca” and “cab” in its 3-grams
* ngram Hashmap is sorted alphabetically after being completed
* @param int n size of n-grams to create
* @param String inp input string to be modelled
*/
public NgramAnalyser(int n, String inp) {
this.checkNgramInput(n, inp);
this.ngramSize = n;
this.ngram = new HashMap<String, Integer>(inp.length());
ArrayList<Character> distinctLetters = new ArrayList<Character>();
String ngramTemp;
Character charTemp;
int ngramOccurance;
for (int i = 0; i < inp.length(); i++) {
ngramTemp = this.splice(inp, i, ngramSize);
charTemp = inp.charAt(i);
if (this.ngram.containsKey(ngramTemp)) {
ngramOccurance = this.ngram.get(ngramTemp);
this.ngram.put(ngramTemp, ngramOccurance + 1);
} else {
this.ngram.put(ngramTemp, 1);
}
if (!distinctLetters.contains(charTemp)) {
distinctLetters.add(charTemp);
}
}
this.alphabetSize = distinctLetters.size();
this.ngramSort();
}
/**
* Analyses the input text for n-grams of size 1.
*/
public NgramAnalyser(String inp) {
this(1,inp);
}
/**
* @return int the size of the alphabet of a given input
*/
public int getAlphabetSize() {
return this.alphabetSize;
}
/**
* @return the total number of distinct n-grams appearing
* in the input text.
*/
public int getDistinctNgramCount() {
Set<String> distinctNgrams = this.getDistinctNgrams();
return distinctNgrams.size();
}
/**
* @return Return a set containing all the distinct n-grams
* in the input string.
*/
public Set<String> getDistinctNgrams() {
Set<String> distinctNgrams = this.ngram.keySet();
return distinctNgrams;
}
/**
* @return the total number of n-grams appearing
* in the input text (not requiring them to be distinct)
* should be equal to the length of the input string
*/
public int getNgramCount() {
int ngramcount = 0;
for (Map.Entry<String, Integer> entry : this.ngram.entrySet()) {
ngramcount += entry.getValue();
}
return ngramcount;
}
/**
* Return the frequency with which a particular n-gram appears
* in the text. If it does not appear at all, return 0.
*
* @param ngram The n-gram to get the frequency of
* @return The frequency with which the n-gram appears.
*/
public int getNgramFrequency(String ngram) {
for (Map.Entry<String, Integer> entry : this.ngram.entrySet()) {
if (ngram.equals(entry.getKey())) {
return entry.getValue();
}
}
return 0;
}
/**
* Generate a summary of the ngrams for this object.
* @return a string representation of the n-grams in the input text
* comprising the ngram size and then each ngram and its frequency
* where ngrams are presented in alphabetical order.
*/
public String toString() {
String outputString = Integer.toString(this.ngramSize);
outputString = outputString + “n”;
outputString = outputString + this.printHashMap(this.ngram);
return outputString;
}
/**
* @return a sub-string specified by the starting index (inclusive) and its length
* If startIndex + length > str.length(), the resulting sub-string will wrap around the end
* Hence splice(“hello”, 3, 4) returns “lohe” (“lo” from the end and “he” from the start)
* And splice (“burger”, 2, 3) returns “rge”
* @throws an IndexOutOfBounds exception if startIndex is negative
* or length is greater than str.length()
* @param s – string to splice
* @param startIndex – start of splice (inclusive)
* @param length – length of splice (number of character in sub-string)
* Public for use in ModelMatcher.java to find sequences.
*/
public static String splice(String s, int startIndex, int length) {
if (length > s.length()) {
throw new IndexOutOfBoundsException
(“NgramAnalyser: Length of substring greater than parent string”);
}
int endIndex = (startIndex + length) % s.length();
if (endIndex <= startIndex ) {
return s.substring(startIndex, s.length()) + s.substring(0, endIndex);
} else {
return s.substring(startIndex, endIndex);
}
}
/**
* Sort am ngram Hashmap by it’s keys (alphabetically)
* @return The sorted hashmap
* Requires the hashmap to have strings for keys and
* integers for values, as per the ngram hashmap
* for both the parameter and the returned map.
* @param map – The hashmap to be sorted
* @throws an IllegalArgument exception if map is not initialized/empty
*/
private HashMap<String, Integer> mapSort(HashMap<String, Integer> map) {
if (map == null || map.size() == 0) {
throw new IllegalArgumentException(“mapSort: map must be initialized and non-empty”);
}
Object[] keysObject = map.keySet().toArray();
//Convert keys object array to keys string array
String[] keys = Arrays.copyOf(keysObject, keysObject.length, String[].class);
Arrays.sort(keys);
HashMap<String, Integer> newMap = new LinkedHashMap<String, Integer>();
int newValue;
for (String newKey : keys) {
newValue = map.get(newKey);
newMap.put(newKey, newValue);
}
return newMap;
}
/**
* Sorts the classes ngram hashmap
* @return the sorted hashmap
* Calls mapSort with parameter this.ngram
*/
private void ngramSort() {
this.ngram = this.mapSort(this.ngram);
}
/**
* Helper function that runs the input of an Ngram constructor
* against some basic checks to sanitize inputs.
* @throws an IllegalArgumentException if the input fields are unsuitable:
* – input string is empty or null
* – ngram size is smaller than zero or
* greater than the length of the input string
*/
private void checkNgramInput(int n, String input) {
if (input == null) {
throw new IllegalArgumentException(“NgramAnalyser: Input string cannot be null”);
} else if (input == “”) {
throw new IllegalArgumentException(“NgramAnalyser: Input string cannot be empty”);
}
if (n <= 0 || n > input.length()) {
throw new IllegalArgumentException
(“NgramAnalyser: ngram size is out of string index bounds”);
}
}
/**
* Utility function to print out a Hashmap in a table format as per project specs
* @param – map to be printed
* @return a string of the map table
* each entry is printed on its own line, with key and value seperated by a space
* No newline characters precede or follow the table string
* @throws an IllegalArgumentException if the map is null/empty
* Generic types used to handle both doubles and integers
*/
public static <N extends Number> String printHashMap(HashMap<String, N> map) {
String outputString = “”;
for (Map.Entry<String, N> entry : map.entrySet()) {
outputString = outputString + entry.getKey() + ” ” + entry.getValue().toString();
outputString = outputString + “n”;
}
//Remove final n
outputString = NgramAnalyser.splice(outputString, 0, outputString.length() – 1);
return outputString;
}
}
==========================================================================================================
//ProjectTest.java
import static org.junit.Assert.*;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.util.Set;
/**
* The test class ProjectTest for student test cases.
* Add all new test cases to this task.
*
*/
public class ProjectTest
{
/**
* Default constructor for test class ProjectTest
*/
public ProjectTest() {
}
/**
* Sets up the test fixture.
*
* Called before every test case method.
*/
@Before
public void setUp() {
}
/**
* Tears down the test fixture.
*
* Called after every test case method.
*/
@After
public void tearDown() {
}
//TODO add new test cases from here include brief documentation
/**
* Test if the number of lines in a string output from Ngram.toString()
* is valid (i.e equal to the size of the alphabet of that Ngram)
* Also ensures that the sort, splice and constructor functions work
* as required to produce the required comparison
*/
@Test(timeout=1000)
public void testSensibleToStringSize() {
String[] stringsToTest = {“Hello my friend”,
“be”,
“Have a nice day you filthy animal”,
“asdfghjkl$$sdfghj%%”,
“2”,
“adadadadaaaaa”,
” “};
Integer[] ngramSizesToTest = {1, 2, 3, 4, 5};
NgramAnalyser analysis;
String analysisString;
for (String s : stringsToTest) {
for (int i : ngramSizesToTest) {
if (i > s.length()) {
try {
analysis = new NgramAnalyser(i, s);
} catch (IllegalArgumentException e) {
assertEquals(0, 0);
}
} else {
analysis = new NgramAnalyser(i, s);
analysisString = analysis.toString();
//Number of lines is equal to the number of n’s plus 1
int numberofLines = analysisString.length() –
analysisString.replace(“n”, “”).length() + 1;
assert(numberofLines >= analysis.getAlphabetSize());
}
}
}
}
/**
* Tests various aspects of the getDistinctNgrams function
* inlcuding set length with comparison to basic boundaries
*/
@Test(timeout=1000)
public void testGetDistinctNgrams() {
String[] stringsToTest = {
“123!@#123!@#”,
“adadadadadadadad”,
“cadadcdaadcdbed”,
“aaaaaa”,
“HOWWEYVUFXBINEF”,
“ba”,
“a e”
};
for (String stringToTest : stringsToTest) {
int ngramSize = 2;
NgramAnalyser analysis = new NgramAnalyser(ngramSize, stringToTest);
Set<String> distinctNgrams = analysis.getDistinctNgrams();
int distinctNgramCount = analysis.getDistinctNgramCount();
int totalNgramCount = analysis.getNgramCount();
//Test that there are fewer or equal distinct Ngrams than total Ngrams
assert(distinctNgramCount <= totalNgramCount);
//Test that there are fewer or equal distinct Ngrams than the size
//of the analysed string
assert(distinctNgramCount <= stringToTest.length());
//Test that the alphabet size is smaller than
//or equal to the number of distinct NGrams
assert(analysis.getAlphabetSize() <= distinctNgramCount);
}
}
/**
* Tests the NgramAnalyser function for more complicated and longer ngrams
* Shore up and finish sooner or later
*/
@Test(timeout=1000)
public void testNgramAnalyser() {
//Realistic testing – e.g on long strings
String[] stringsToTest = {
“modelMatcher.get(this.getAlphabetSize() – 1 the 15##&&&”,
“The quick brown fox jumped over the lazy dog”,
“The probability we obtain from this calculation may be very”+
“small – in fact, potentially so small as to be indistinguishable”+
“from zero when using Java’s built-in floating-point arithmetic.”+
“Therefore, we will calculate and express the likelihood using”,
“In this project, your task will be to write a computer program”+
“that can (usually) predict whether text samples such as these”+
“are more likely to have been written by one author or another, “+
“be from one language or another and so on. There are many”+
“reasonable computational approaches to this problem. Our approach”+
“will not be to try to solve the problem as an intelligent human”+
“would. Instead, in this project, we will consider one of the”+
“computational models: Markov models. We will begin by building”+
“a statistical model of the kind of language used in each category”+
“of texts we consider. For example, we will train a system using text”+
“by different authors. Then, given a new piece of text, we will”+
“determine who wrote it by measuring how well it fits each of the”+
“statistical models, and selecting the one that gives the best fit”
};
int[] ngramSizesToTest = {2, 4, 6, 10};
for (String stringToTest : stringsToTest) {
for (int ngramSize : ngramSizesToTest) {
NgramAnalyser analysis = new NgramAnalyser(ngramSize, stringToTest);
//Test toString method
String toString = analysis.toString();
//System.out.println(toString);
//Test that ngramCount = length of the string
assert(analysis.getNgramCount() == stringToTest.length());
//Test that the alphabet size doesn’t exceed the length of the string
assert(analysis.getAlphabetSize() <= stringToTest.length());
//
//Perform some basic tests done in getDistinctNgram tests.
assert(analysis.getDistinctNgramCount() <= analysis.getNgramCount());
assert(analysis.getDistinctNgramCount() <= stringToTest.length());
assert(analysis.getAlphabetSize() <= analysis.getDistinctNgramCount());
//Ensure each n-gram is of the correct length
for (String ngram : analysis.getDistinctNgrams()) {
assert(ngram.length() == ngramSize);
}
}
}
//Specific cases
//1-grams from a 1 length string
NgramAnalyser analysis = new NgramAnalyser(1, “1”);
assert(analysis.getNgramCount() == 1);
assert(analysis.getAlphabetSize() == 1);
assert(analysis.getDistinctNgrams().size() == 1);
//n-gram from an n-length string (15 in this case)
String s = “djaaaabaaaabcda”;
analysis = new NgramAnalyser(15, s);
assert(analysis.getDistinctNgrams().contains(s));
}