Public Member Functions

    def __init__(self)
    def use_analyser(self, Analyser analyser)
    def load_tokeniser(self, str hfstfile)
    def fsa_tokenise(self, str line)
    def python_tokenise(self, str line)
    def tokenise(self, str line)
    def accept(self, token)
    def tokenise_sentence(self, str sentence)
    def tokenise_plaintext(self, f)
    def tokenise_conllu(self, f)
    def tokenise_vislcg(self, f)

Public Attributes

    analyser
    tokeniser
    try_titlecase
    try_detitlecase
    try_uppercase
    try_lowercase
An object for omorfi’s tokenisation.
◆ __init__()

def omorfi.tokeniser.Tokeniser.__init__(self)

Initialise an empty tokeniser. The tokenisation model is loaded
separately with load_tokeniser(), from a file containing a single
HFST automaton binary.
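A minimal construction sketch, assuming only the member list above;
the model path is hypothetical and depends on where omorfi is
installed:

    from omorfi.tokeniser import Tokeniser

    tok = Tokeniser()
    # Hypothetical model path; a real installation ships its own
    # tokeniser automaton file.
    tok.load_tokeniser("/usr/share/omorfi/omorfi.tokenise.pmatchfst")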
◆ accept()

def omorfi.tokeniser.Tokeniser.accept(self, token)

Check whether the token is in the dictionary.

Returns:
    False for OOVs, True otherwise. Note that this is not
    necessarily more efficient than bool(analyse(token)).
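A hedged sketch of OOV filtering with accept(), assuming a loaded
Tokeniser `tok` as above and that accept() takes the same token
values that tokenise() returns:

    tokens = tok.tokenise("Kissa istuu puussa.")
    # accept() is True for dictionary words, False for OOVs.
    oovs = [t for t in tokens if not tok.accept(t)]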
◆ fsa_tokenise()

def omorfi.tokeniser.Tokeniser.fsa_tokenise(self, str line)

Tokenise with FSA.

Args:
    line: string to tokenise

Todo:
    Not implemented (needs pmatch python support)
◆ python_tokenise()

def omorfi.tokeniser.Tokeniser.python_tokenise(self, str line)

Tokenise with Python's basic string functions.

Args:
    line: string to tokenise
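For illustration, a sketch of calling the fallback directly, under
the same assumptions as above:

    # Basic string-function tokenisation; no automaton required.
    tokens = tok.python_tokenise("Tämä on lause.")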
◆ tokenise()

def omorfi.tokeniser.Tokeniser.tokenise(self, str line)

Perform tokenisation with the loaded tokeniser, if any, or `split()`.

If a tokeniser is available, it is applied to the input line, and if
a result is produced, it is split into tokens according to the
tokenisation strategy and returned as a list. If no tokeniser is
present, or none gives a result, the line is tokenised using Python's
basic string functions. If an analyser is present, the tokeniser will
try harder to get some analyses for each token, using a hard-coded
list of extra splits.

Args:
    line: a string to be tokenised; should contain a line of text or
        a sentence

Returns:
    A list of tokens based on the line. The list may include boundary
    non-tokens if e.g. sentence boundaries are recognised. For an
    empty line, a paragraph-break non-token may be returned.
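A usage sketch, assuming a Tokeniser `tok` as constructed above; the
exact shape of the boundary non-tokens is left unspecified here:

    for line in ["Kissa istuu puussa.", ""]:
        tokens = tok.tokenise(line)
        # An empty line may yield a paragraph-break non-token
        # rather than an empty list.
        print(len(tokens))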
◆ tokenise_conllu()

def omorfi.tokeniser.Tokeniser.tokenise_conllu(self, f)

Tokenise a CoNLL-U sentence or comment.

Should be used with a file-like iterable that has a CoNLL-U sentence,
comment, or empty block coming up.

Args:
    f: filelike object with iterable strings

Returns:
    list of tokens
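A hedged sketch of reading one block; the filename is hypothetical,
and it assumes each call consumes the next sentence or comment block
from the iterator:

    with open("corpus.conllu") as f:  # hypothetical input file
        sentence = tok.tokenise_conllu(f)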
◆ tokenise_plaintext()

def omorfi.tokeniser.Tokeniser.tokenise_plaintext(self, f)

Tokenise a whole text.

Args:
    f: filelike object with iterable strings

Returns:
    list of tokens
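A minimal sketch, assuming a plain-text file and a Tokeniser `tok` as
above; the filename is hypothetical:

    with open("text.txt") as f:  # hypothetical input file
        tokens = tok.tokenise_plaintext(f)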
◆ tokenise_sentence()

def omorfi.tokeniser.Tokeniser.tokenise_sentence(self, str sentence)

Tokenise a sentence.

To be used when the text has already been split into sentences. If
the text is plain text with sentence boundaries within lines, use one
of the plain-text tokenisation methods instead.

Args:
    sentence: a string containing one sentence

Returns:
    list of tokens in sentence
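For pre-split sentences, a one-line sketch under the same assumptions
as above:

    tokens = tok.tokenise_sentence("Minä pidän kahvista.")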
◆ tokenise_vislcg()

def omorfi.tokeniser.Tokeniser.tokenise_vislcg(self, f)

Tokenises a sentence from VISL-CG format data.

Returns a list of tokens when it hits the first non-token block,
including a token representing that non-token block.

Args:
    f: filelike object to iterate strings of vislcg data

Returns:
    list of tokens
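A hedged sketch for VISL-CG input; the filename is hypothetical:

    with open("analysed.vislcg") as f:  # hypothetical input file
        # Reads tokens up to and including the first non-token block.
        sentence = tok.tokenise_vislcg(f)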
The documentation for this class was generated from the following file:
- /home/flammie/github/flammie/omorfi/src/python/omorfi/tokeniser.py