"""Baseline end-of-sentence (EOS) classifiers with a small train/evaluate CLI.

Data files are read line by line; each line is split on whitespace into a
label (first field) followed by that example's features. The classifiers in
this file emit the labels ``'EOS'`` and ``'NEOS'``.
"""

import argparse

try:
    from sklearn.dummy import DummyClassifier
except ImportError:  # scikit-learn is only needed by EOSClassifier1
    DummyClassifier = None


class EOSClassifier1:
    """Majority-class baseline backed by scikit-learn's ``DummyClassifier``."""

    def train(self, trainX, trainY):
        """Fit a ``most_frequent`` dummy classifier on the training labels.

        Args:
            trainX: Feature rows, one per training example.
            trainY: Gold labels, parallel to ``trainX``.

        Raises:
            ImportError: If scikit-learn is not installed.
        """
        if DummyClassifier is None:
            raise ImportError(
                'EOSClassifier1 requires scikit-learn '
                '(sklearn.dummy.DummyClassifier)'
            )
        self.clf = DummyClassifier(strategy='most_frequent')
        # Fit on the features passed in; the previous code referenced an
        # undefined name (`processedX`) here and raised NameError.
        self.clf.fit(trainX, trainY)

    def classify(self, testX):
        """Return one predicted label per row of ``testX``.

        With the ``most_frequent`` strategy every row receives the most
        frequent training label, e.g. ``['EOS'] * len(testX)`` when 'EOS'
        is the majority class.
        """
        return self.clf.predict(testX)


class EOSClassifier2:
    """Rule-based classifier: a known abbreviation means "not end of sentence".

    Args:
        abbrevs_path: Word-list file with one abbreviation per line.
    """

    DEFAULT_ABBREVS_PATH = 'classes/abbrevs'
    # Index of the feature compared against the abbreviation list.
    # NOTE(review): presumably the token carrying the candidate boundary --
    # confirm against the data format.
    TOKEN_FEATURE_INDEX = 3

    def __init__(self, abbrevs_path=DEFAULT_ABBREVS_PATH):
        self.abbrevs_path = abbrevs_path

    def train(self, trainX, trainY):
        """Load the abbreviation list; the training data itself is unused."""
        self.abbrevs = load_wordlist(self.abbrevs_path)

    def classify(self, testX):
        """Label each row 'NEOS' if its token is a known abbreviation, else 'EOS'.

        The token is lower-cased before lookup, so the word list is expected
        to contain lower-case entries.
        """
        idx = self.TOKEN_FEATURE_INDEX
        return [
            'NEOS' if row[idx].lower() in self.abbrevs else 'EOS'
            for row in testX
        ]


def load_wordlist(file):
    """Read ``file`` and return the set of its whitespace-stripped lines."""
    with open(file) as fin:
        return {line.strip() for line in fin}


def load_data(file):
    """Read a labelled data file.

    Each non-blank line is split on whitespace; the first field is the label
    and the remaining fields are the features.

    Args:
        file: Path of the data file.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a list of feature lists and ``y`` the
        parallel list of labels.
    """
    X = []
    y = []
    with open(file) as fin:
        for line in fin:
            fields = line.split()
            if not fields:
                # Blank line: there is no label to read, so skip it rather
                # than fail with IndexError.
                continue
            y.append(fields[0])
            X.append(fields[1:])
    return X, y


def evaluate(outputs, golds):
    """Print ``"<correct> / <total> <accuracy>"`` for predictions vs. gold labels.

    Args:
        outputs: Predicted labels.
        golds: Gold labels, parallel to ``outputs``.
    """
    correct = sum(1 for hyp, gold in zip(outputs, golds) if hyp == gold)
    total = len(golds)
    # An empty gold set has no meaningful accuracy; report 0.0 instead of
    # raising ZeroDivisionError.
    accuracy = correct / total if total else 0.0
    print(f'{correct} / {total} {accuracy}')


def parseargs():
    """Parse the command line: ``--train`` and ``--test`` (required), ``--output``."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', required=True)
    parser.add_argument('--test', required=True)
    parser.add_argument('--output')
    return parser.parse_args()


def main():
    """Train a classifier, label the test set, optionally save, then score."""
    args = parseargs()
    trainX, trainY = load_data(args.train)
    testX, testY = load_data(args.test)

    # Swap in EOSClassifier1() to run the majority-class baseline instead.
    classifier = EOSClassifier2()
    classifier.train(trainX, trainY)
    outputs = classifier.classify(testX)

    if args.output is not None:
        # One predicted label per line, in test-set order.
        with open(args.output, 'w') as fout:
            fout.writelines(f'{label}\n' for label in outputs)

    evaluate(outputs, testY)


if __name__ == '__main__':
    main()