// Revision 48 (by gdshaw@RISCID.ORG, 2010/11/15 19:58:04): modified parser to ignore word counts in lexicon files.
// © 2010 Graham Shaw.
// Copying and distribution of this file, with or without modification,
// are permitted in any medium without royalty provided the copyright
// notice and this notice are preserved.  This file is offered as-is,
// without any warranty.

#include <stdexcept>
#include <iostream>
#include <fstream>
#include <string>

#include "lexicon.h"

namespace {

/** Break a string into the fields delimited by a separator character.
 * The separator itself is not included in any field. Empty fields are
 * kept, so the result always holds exactly one more element than there
 * are separators in the input (an empty input gives one empty field).
 * @param s the string to be split
 * @param c the separator character
 * @return the fields of s, in the order in which they occur
 */
std::vector<std::string> split(const std::string& s,char c)
{
	std::vector<std::string> fields;
	std::string::size_type start=0;
	for (;;)
	{
		const std::string::size_type stop=s.find(c,start);
		if (stop==std::string::npos)
		{
			// No further separator: the remainder is the final field.
			fields.push_back(s.substr(start));
			return fields;
		}
		fields.push_back(s.substr(start,stop-start));
		start=stop+1;
	}
}

} /* anonymous namespace */

void lexicon::read(const std::string& pathname)
{
	std::vector<tagset> tagsets;
	std::ifstream in(pathname.c_str());
	while (in&&!in.eof())
	{
		std::string line;
		std::getline(in,line);
		if ((line.size()==0)||(line[0]=='#'))
		{
			// Ignore blank lines and comments.
		}
		else if ((line[0]=='[')&&(line[line.size()-1]==']'))
		{
			// Record tag tuple, for application to
			// subsequent word tuples.
			tagsets.clear();
			std::vector<std::string> tagset_strs=split(line.substr(1,line.size()-2),',');
			for (size_t i=0;i!=tagset_strs.size();++i)
			{
				tagset tags;
				std::vector<std::string> tag_strs=split(tagset_strs[i],'+');
				for (size_t j=0;j!=tag_strs.size();++j)
				{
					tags|=tag(tag_strs[j]);
				}
				tagsets.push_back(tags);
			}
		}
		else
		{
			// Apply tags to word tuple.
			std::vector<std::string> words=split(line,',');
			if (tagsets.size()==0)
			{
				throw std::runtime_error("no tags specified");
			}
			if (words.size()!=tagsets.size())
			{
				throw std::runtime_error(
					"mismatch between word and tag tuples");
			}
			for (size_t i=0;i!=words.size();++i)
			{
				std::string word=words[i];
				std::string::size_type f=word.find('(');
				if (f!=std::string::npos)
				{
					word.erase(f);
				}
				_words[word]|=tagsets[i];
			}
		}
	}
}

/** Look up the tags recorded for a word.
 * @param word the word to be looked up
 * @return the tag set recorded for the word, or a default-constructed
 *  tag set if the word has no entry
 */
tagset lexicon::find(const std::string& word) const
{
	const words_type::const_iterator entry=_words.find(word);
	if (entry==_words.end())
	{
		return tagset();
	}
	return entry->second;
}