Logo Search packages:      
Sourcecode: bibledit version File versions  Download package

capitalization.cpp

/*
    Copyright (C) 2003-2006 Teus Benschop.

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

*/


#include "libraries.h"
#include "utilities.h"
#include "constants.h"
#include <gtk/gtk.h>
#include <fnmatch.h>
#include <string>


// Reference information, i.e. "where are we?"
// Updated by start_element_handler whenever a book, chapter or verse element
// opens, and used when messages are output.
ustring book;
int chapter;
ustring verse;
// Containers for commandline arguments. They are filled in main.
// Characters after which check_capitalization expects a capital.
set<gunichar> punctuation_followed_by_capitals;
// Characters after which a lower case letter is accepted even though a capital
// was expected.
set<gunichar> ignore_lower_case_following;
// Abbreviations after which a lower case letter is accepted.
set<ustring> abbreviations;
// If true, any lower case prefix before a capital inside a word is accepted.
bool allow_any_uncapitalized_prefixes;
// Approved lower case prefixes and approved capitalized suffixes, used by
// check_suspicious_capitalization.
set<ustring> uncapitalized_prefixes;
set<ustring> capitalized_suffixes;
// Location information.
// Name of the element most recently opened or closed by the parser.
ustring current_element;
// Verse text information.
// verse_text accumulates the verse text of the current book. The three vectors
// are parallel: for each piece of text added they record the chapter, the
// verse, and the length verse_text had just before that piece was appended.
vector<int> verse_chapter;
vector<ustring> verse_verse;
vector<size_t> verse_pointer;
ustring verse_text;


ustring get_context (GtkTextIter iter)
// Returns the context at iter: the text running from two word starts before
// iter up to two word ends after it.
// The iterator is taken by value, so the caller's iterator is not moved.
{
  GtkTextIter context_begin = iter;
  GtkTextIter context_end = iter;
  gtk_text_iter_backward_word_starts (&context_begin, 2);
  gtk_text_iter_forward_word_ends (&context_end, 2);
  // gtk_text_iter_get_text returns a newly allocated string. Copy it into a
  // ustring and free the original, so that it does not leak.
  gchar * raw_context = gtk_text_iter_get_text (&context_begin, &context_end);
  ustring context (raw_context);
  g_free (raw_context);
  return context;
}


void get_chapter_and_verse (vector<int>& chapters, vector<ustring>& verses, 
                            vector<size_t>& pointers, GtkTextIter iter,
                            int& chapter, ustring& verse)
// Based on the inputs (chapters, verses, pointers, iter),
// it gets the chapter and verse we are now at.
// The three containers are parallel: pointers[i] is the text offset at which
// chapters[i] / verses[i] start. The last entry whose offset is not beyond the
// offset of iter wins.
// If no entry qualifies (e.g. the containers are empty), chapter and verse are
// left untouched.
{
  size_t offset = gtk_text_iter_get_offset (&iter);
  // Walk backwards. The index is tested before it is decremented, so that the
  // unsigned counter stops at zero instead of wrapping around.
  for (size_t i = pointers.size(); i-- > 0; ) {
    // Skip entries that have no matching chapter or verse.
    if ((i >= chapters.size()) || (i >= verses.size()))
      continue;
    if (offset >= pointers[i]) {
      chapter = chapters[i];
      verse = verses[i];
      return;
    }
  }
}


bool is_reference (GtkTextIter iter)
/*
Looks whether the text at iter looks like a reference.
A reference, e.g. Mt.5.5 or Mt.5:5 or John 10:5 follows a certain pattern,
while going through it. Some references are given without the bookname, e.g.
"10.5". Handle these too.
Patterns:
- digit, dot/colon/comma, digit.
The text inspected runs from 4 characters before iter to 4 characters after it.
The iterator is taken by value, so the caller's iterator is not moved.
*/ 
{
  // Get the "reference": the bit of text around iter.
  GtkTextIter range_begin = iter;
  gtk_text_iter_backward_chars (&range_begin, 4);
  gtk_text_iter_forward_chars (&iter, 4);
  // The text returned is newly allocated and has to be freed after use.
  gchar * reference = gtk_text_iter_get_text (&range_begin, &iter);
  // Do the matching against the pattern.
  const bool isref = (fnmatch ("*[0-9][:,.][0-9]*", reference, 0) == 0);
  g_free (reference);
  // Return what we found.
  return isref;
}


void check_capitalization (vector<int>& chapters, vector<ustring>& verses, 
                           ustring& text, vector<size_t>& pointers,
                           bool end_check)
/*
Checks capitalization in text.
A message is output for each lower case letter found where a capital was
expected, that is, after one of the characters in
punctuation_followed_by_capitals, unless one of the exceptions applies.
chapters, verses and pointers are parallel containers that map offsets in text
to the chapter and verse to mention in these messages.
If "end_check" is true, it also checks for final sentence closing: a message is
output unless the text ended while a capital was still expected. A digit as the
last graphical character cancels that expectation.
*/
{
  /*
  Note that this at first used gtk_text_iter_starts_sentence (&iter) and
  gtk_text_iter_ends_sentence (&iter), but these functions are not good enough,
  because do not work in several cases, like e.g. in the following line, it does
  not indicate the end of the sentence:
    As soon as the leaders of the tribes of Israel took their places, the 
    Israelites said, “How could such a horrible thing happen?"
  Therefore we use other means to check sentences.
  */
  
  // No check if there's no text.
  if (trim (text).empty())
    return;
  // Some variables needed.
  bool expect_capital_now = false;
  bool expect_capital_caused_by_reference = false;
  gunichar previous_char = 0;
  int localchapter = 0;
  ustring localverse = "0";
  // The text is loaded in a text buffer so that it can be walked through
  // character by character and word by word.
  GtkTextBuffer * textbuffer;
  textbuffer = gtk_text_buffer_new (NULL);
  gtk_text_buffer_set_text (textbuffer, text.c_str(), -1);
  GtkTextIter iter;
  gtk_text_buffer_get_start_iter (textbuffer, &iter);
  bool going = true;  
  while (going) {
    // Get the unicode character.
    gunichar unichar = gtk_text_iter_get_char (&iter);
    // See whether to expect a capital now.
    if (punctuation_followed_by_capitals.find (unichar) != punctuation_followed_by_capitals.end()) {
      // Ok, expect capital.
      expect_capital_now = true;
      // Was this expectation caused by a reference?
      expect_capital_caused_by_reference = is_reference (iter);
    }
    // If we expect a capital, and we find one, no longer look for one.
    if (expect_capital_now) {
      if (g_unichar_isupper (unichar)) {
        expect_capital_now = false;
      }
    }
    // If we expect a capital, and we get lower case, that might be trouble.
    if (expect_capital_now) {
      if (g_unichar_islower (unichar)) {
        // There is no trouble if it follows a character after which to ignore lower case.
        if (ignore_lower_case_following.find (previous_char) != ignore_lower_case_following.end()) {
          expect_capital_now = false;
        }
        // If the lowercase character follows an abbreviation, there is no trouble either.
        // The candidate abbreviation is the word reached by moving back to a
        // word start, plus the one character that follows that word.
        GtkTextIter iter2 = iter;
        gtk_text_iter_backward_word_start (&iter2);
        GtkTextIter iter3 = iter2;
        gtk_text_iter_forward_word_end (&iter3);
        gtk_text_iter_forward_char (&iter3);
        // The text returned is newly allocated: copy it and free it, so that
        // it does not leak on every lower case letter that is examined.
        gchar * raw_abbreviation = gtk_text_iter_get_text (&iter2, &iter3);
        ustring abbreviation (raw_abbreviation);
        g_free (raw_abbreviation);
        if (abbreviations.find (abbreviation) != abbreviations.end()) {
          expect_capital_now = false;
        }
        // If it follows a reference, there is no trouble.
        if (expect_capital_caused_by_reference)
          expect_capital_now = false;
        // Ok, give message.
        if (expect_capital_now) {
          // Determine chapter and verse.
          get_chapter_and_verse (chapters, verses, pointers, iter, localchapter, localverse);
          output_xml_message (book, localchapter, localverse, "Capital expected: " + get_context (iter));
        }
        // Only give one message about missing capitals in this context.
        expect_capital_now = false;
      }
    }      
    // Store this character as the previous character for the next round.
    // Only graphical characters count, so e.g. spaces are skipped.
    if (g_unichar_isgraph (unichar))
      previous_char = unichar;
    // Next round.
    going = gtk_text_iter_forward_char (&iter);
  }
  // The sentence should be ended with proper punctuation.
  if (end_check) {
    // Text that ends on a digit is not regarded as properly ended.
    if (expect_capital_now)
      if (g_unichar_isdigit (previous_char))
        expect_capital_now = false;
    // If no capital is expected at this point, the last sentence was not
    // closed. This message uses the global chapter and verse.
    if (!expect_capital_now) {
      output_xml_message (book, chapter, verse, "Unended sentence: " + get_context (iter));
    }
  }
  // Free memory
  g_object_unref (textbuffer);
}


void mixed_capitalization_message (ustring& word)
// Outputs a message that word has mixed capitalization, located at the
// current book, chapter and verse.
{
  output_xml_message (book, chapter, verse, "Mixed capitalization: " + word);
}


void check_suspicious_capitalization (ustring& text)
/*
Checks on suspicious capitalization, like "bOat" or "BOat".
There are exceptions to this check:
- words with an approved lower case prefix, e.g. given through
  uncapitalized_prefixes, or any prefix if allow_any_uncapitalized_prefixes;
- words with an approved suffix given through capitalized_suffixes.
A message is output for each word that remains suspicious.
*/
{
  // Load text into buffer. A space is added at the end of the copy; the text
  // passed in is not modified.
  ustring text2 (text);
  text2.append (" ");
  GtkTextBuffer * textbuffer;
  textbuffer = gtk_text_buffer_new (NULL);
  gtk_text_buffer_set_text (textbuffer, text2.c_str(), -1);
  // Iterators.  
  GtkTextIter startiter, enditer;
  // Check all separate words.
  gtk_text_buffer_get_start_iter (textbuffer, &enditer);
  while (gtk_text_iter_forward_word_end (&enditer)) {
    startiter = enditer;
    gtk_text_iter_backward_word_start (&startiter);    
    // Record for each character of the word whether it is a capital.
    vector<bool> capspattern;
    unsigned int capscount = 0;
    GtkTextIter iter = startiter;
    while (gtk_text_iter_in_range (&iter, &startiter, &enditer)) {
      bool upper = g_unichar_isupper (gtk_text_iter_get_char (&iter));
      capspattern.push_back (upper);
      if (upper) capscount++;
      gtk_text_iter_forward_char (&iter);
    }
    // No further checking if words are too short.
    if (capspattern.size() < 2)
      continue;
    // No further checking if only small letters.
    if (capscount == 0)
      continue;
    // No further checking if all capitals.
    if (capscount == capspattern.size())
      continue;
    // No further checking if first letter capitalized only.
    if ((capspattern[0]) && (capscount == 1))
      continue;
    // Ok, there could be a problem of mixed capitalization.
    // The text returned is newly allocated: copy it and free it straight away,
    // so that none of the "continue" paths below leaks it.
    gchar * raw_word = gtk_text_iter_get_text (&startiter, &enditer);
    ustring word (raw_word);
    g_free (raw_word);
    // Get the prefix before the first capital, and the suffix after it.
    // The search starts at the second character, so a capitalized initial
    // becomes part of the prefix.
    ustring uncapitalized_prefix;
    ustring capitalized_suffix;
    for (unsigned int i = 1; i < capspattern.size(); i++) {
      if (capspattern[i]) {
        uncapitalized_prefix = word.substr (0, i);
        capitalized_suffix = word.substr (i, word.length() - i);
        break;
      }
    }
    // See whether the suffix is properly capitalized: it has either one
    // capital only, or consists of capitals only.
    unsigned int suffix_capital_count = 0;
    for (unsigned int i = 0; i < capitalized_suffix.length(); i++) {
      if (g_unichar_isupper (g_utf8_get_char(capitalized_suffix.substr(i, 1).c_str())))
        suffix_capital_count++;
    }
    bool suffix_properly_capitalized = false;
    if (suffix_capital_count == 1)
      suffix_properly_capitalized = true;
    if (suffix_capital_count == capitalized_suffix.length())
      suffix_properly_capitalized = true;
    // Give message and continue if capitalization error in suffix, but only
    // if this so-called wrongly capitalized suffix has not been approved of.
    if (!suffix_properly_capitalized) {
      if (capitalized_suffixes.find (capitalized_suffix) == capitalized_suffixes.end ()) {
        mixed_capitalization_message (word);
        continue;
      }
    }
    // No further checking if this uncapitalized prefix is in the list,
    // or any is allowed.
    if (uncapitalized_prefixes.find (uncapitalized_prefix) != uncapitalized_prefixes.end())
      continue;
    if (allow_any_uncapitalized_prefixes)
      continue;
    // Ok, not in the list. Try again with lower case initial.
    ustring initial = uncapitalized_prefix.substr (0, 1);
    initial = initial.casefold();
    uncapitalized_prefix.replace (0, 1, initial);
    if (uncapitalized_prefixes.find (uncapitalized_prefix) != uncapitalized_prefixes.end())
      continue;
    // No further checking if the suffix is in the list of approved suffixes.
    if (capitalized_suffixes.find (capitalized_suffix) != capitalized_suffixes.end ())
      continue;
    // Ok, not found, but it could be this suffix is in all capitals. Handle that
    // by keeping its initial and casefolding the rest before looking it up.
    initial = capitalized_suffix.substr (0, 1);
    capitalized_suffix.erase (0, 1);
    capitalized_suffix = capitalized_suffix.casefold();
    capitalized_suffix.insert (0, initial);
    if (capitalized_suffixes.find (capitalized_suffix) != capitalized_suffixes.end ())
      continue;
    // Ok, it appears we've got an error here -> give message.
    mixed_capitalization_message (word);
  }
  // Free memory
  g_object_unref (textbuffer);
}


void start_element_handler (GMarkupParseContext *context,
                            const gchar         *element_name,
                            const gchar        **attribute_names,
                            const gchar        **attribute_values,
                            gpointer             user_data,
                            GError             **error)
// Called by the parser when an element opens.
// Remembers the name of the element, and for book, chapter and verse elements
// stores the value of the first attribute as the current book, chapter or
// verse. If such an element has no attributes, the stored value is left as it
// was.
{
  current_element = element_name;
  // The array of values is NULL-terminated, so an element without attributes
  // has NULL in the first slot. That cannot be converted into a string.
  const gchar * first_value = attribute_values ? attribute_values[0] : NULL;
  if (first_value == NULL)
    return;
  if (current_element == BOOK_TAG) {
    // A book starts. Get the name of the book.    
    book = first_value;
  } 
  else if (current_element == CHAPTER_TAG) {
    // A chapter starts. Gets its number.
    chapter = convert_to_int (first_value);
  } 
  else if (current_element == VERSE_TAG ) {
    // A verse starts. Store it.
    verse = first_value;
  }
}


void end_element_handler (GMarkupParseContext *context,
                          const gchar         *element_name,
                          gpointer             user_data,
                          GError             **error)
// Called by the parser when an element closes.
// Remembers the name of the element. At the end of a book it checks the
// capitalization of all verse text collected, and then empties the containers
// that hold that text. Nothing is done at the end of other elements.
{
  current_element = element_name;
  // Only the end of a book needs action.
  if (current_element != BOOK_TAG)
    return;
  // Check the verse text of the whole book, including proper sentence closing.
  check_capitalization (verse_chapter, verse_verse, verse_text, verse_pointer, true);
  // Get ready for the next book.
  verse_chapter.clear();
  verse_verse.clear();
  verse_text.clear();
  verse_pointer.clear();
}


void text_handler (GMarkupParseContext *context,
                   const gchar         *text,
                   gsize                text_len,
                   gpointer             user_data,
                   GError             **error)
// Called by the parser for character data.
// Depending on the element the text belongs to, the text is either checked
// straight away, or, for verse text, stored to be checked at the end of the
// book. Text that is empty after trimming is ignored.
{
  // The parser passes text that is not nul-terminated, so exactly text_len
  // bytes are taken rather than reading on till a nul byte is found.
  std::string raw_text (text, text_len);
  ustring utext (raw_text);
  utext = trim (utext);
  if (utext.empty())
    return;
  // Text of these elements is not checked.
  if ((current_element == IDENTIFIER_TEXT_TAG) || (current_element == CHAPTER_TEXT_TAG))
    return;
  // Text that is checked on its own is all located at the current chapter and
  // verse, starting at offset 0.
  vector<int> chapters;
  vector<ustring> verses;
  vector<size_t> pointers;
  chapters.push_back (chapter);
  verses.push_back (verse);
  pointers.push_back (0);
  if ((current_element == INTRODUCTION_TEXT_TAG) || (current_element == HEADING_TEXT_TAG)) {
    // Checked without the check on final sentence closing.
    check_capitalization (chapters, verses, utext, pointers, false);
    check_suspicious_capitalization (utext);
  }
  else if ((current_element == STUDY_NOTE_TEXT_TAG)
           || (current_element == NOTE_TEXT_TAG)
           || (current_element == CROSSREFERENCE_TEXT_TAG)) {
    // Checked including the check on final sentence closing.
    check_capitalization (chapters, verses, utext, pointers, true);
    check_suspicious_capitalization (utext);
  }
  else if (current_element == VERSE_TEXT_TAG) {
    // Store verse text for checking at the end of the book. We cannot check 
    // per verse or chapter, because sentences could span them.
    verse_chapter.push_back (chapter);
    verse_verse.push_back (verse);
    verse_pointer.push_back (verse_text.length());
    if (!verse_text.empty())
      verse_text.append (" ");
    verse_text.append (utext);
    // Check suspicious capitalization in the text.
    check_suspicious_capitalization (utext);
  }
}



void passthrough_handler    (GMarkupParseContext *context,
                             const gchar         *passthrough_text,
                             gsize                text_len,
                             gpointer             user_data,
                             GError             **error)
// Passthrough callback for the parser. It is installed in the GMarkupParser
// table in main, and deliberately does nothing with the text it is given.
{
}


void error_handler          (GMarkupParseContext *context,
                             GError              *error,
                             gpointer             user_data)
// Error callback for the parser: writes the message of the error to stderr.
{
  const gchar * message = error->message;
  cerr << message << endl;
}


int main (int argc, char *argv[])
{
  // Initialize GTK
  gtk_init (&argc, &argv);
  // Information provided when no arguments are given.
  if (argc == 1) {
    cout << "sc-capitalization reads checking units from stdin," << endl;
    cout << "verifies the capitalization in that text, and outputs its report on stdout." << endl;
    cout << "Commandline arguments:" << endl;
    cout << "--punctuation-followed-by-capitals <characters>" << endl;
    cout << "  A list of characters without spaces." << endl;
    cout << "--ignore-lowercase-following <characters>" << endl;
    cout << "  A list of characters without spaces." << endl;
    cout << "--abbreviations <filename>" << endl;
    cout << "  A file containing abbreviations, one per line" << endl;
    cout << "--uncapitalized-prefixes <filename>" << endl;
    cout << "  A file containing prefixes, one per line" << endl;
    cout << "--any-prefixes" << endl;
    cout << "  Any uncapitalized prefixes are allowed" << endl;
    cout << "  A file containing suffixes, one per line" << endl;
    return 0;
  }
  // Process command line arguments.
  allow_any_uncapitalized_prefixes = false;
  for (int i = 1; i < argc; i++) {
    ustring argument;
    argument = argv[i];
    if (argument.length() > 2) {
      if (argument.substr (0, 2) == "--") {
        argument.erase (0, 2);
        if (argument == "punctuation-followed-by-capitals") {
          argument = argv[++i];
          for (unsigned int i2 = 0; i2 < argument.length(); i2++) {
            punctuation_followed_by_capitals.insert (g_utf8_get_char (argument.substr(i2, 1).c_str()));
          }
        }
        if (argument == "ignore-lowercase-following") {
          argument = argv[++i];
          for (unsigned int i2 = 0; i2 < argument.length(); i2++) {
            ignore_lower_case_following.insert (g_utf8_get_char (argument.substr(i2, 1).c_str()));
          }
        }
        if (argument == "abbreviations") {
          argument = argv[++i];
          ReadText rt (argument, true);
          for (unsigned int i2 = 0; i2 < rt.lines.size(); i2++) {
            abbreviations.insert (rt.lines[i2]);
          }
        }
        if (argument == "uncapitalized-prefixes") {
          argument = argv[++i];
          ReadText rt (argument, true);
          for (unsigned int i2 = 0; i2 < rt.lines.size(); i2++) {
            uncapitalized_prefixes.insert (rt.lines[i2]);
          }
        }
        if (argument == "any-prefixes") {
          allow_any_uncapitalized_prefixes = true;
        }
        if (argument == "capitalized-suffixes") {
          argument = argv[++i];
          ReadText rt (argument, true);
          for (unsigned int i2 = 0; i2 < rt.lines.size(); i2++) {
            capitalized_suffixes.insert (rt.lines[i2]);
          }
        }
      }
    }
  }
  // Read data from stdin.
  GIOChannel* io;
  gchar* text;
  gsize length;
  io = g_io_channel_unix_new (0);
  g_io_channel_read_to_end (io, &text, &length, NULL);
  // Set up parser.
  GMarkupParseContext *context;
  GMarkupParser parser = {
    start_element_handler,
    end_element_handler,
    text_handler,
    passthrough_handler,
    error_handler
  };
  // Parse xml data.
  context = g_markup_parse_context_new (&parser, GMarkupParseFlags (0), NULL, NULL);
  g_markup_parse_context_parse (context, text, length, NULL);
  g_markup_parse_context_end_parse (context, NULL);
  // Free some resources.  
  g_markup_parse_context_free (context);
  g_free (text);
  g_io_channel_unref (io);
  // Ready.
  return 0;
}

Generated by  Doxygen 1.6.0   Back to index