Logo Search packages:      
Sourcecode: bibledit version File versions  Download package

input-usfm.cpp

/*
    Copyright (C) 2003-2006 Teus Benschop.

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

*/


#include "libraries.h"
#include "utilities.h"
#include "constants.h"


// Variables.
// Name of the book being processed; written into the book start tag.
ustring bookname;
// Number of the chapter being processed; written into the chapter start tag.
ustring chapternumber;
// Number of the verse being processed; written into the verse start tag.
ustring versenumber;
// Set by the --categorize option: output_content() then sorts the text into
// categories instead of dumping it unchanged.
bool categorize = false;
// Set by the --no-text option: output_content() then emits no text at all,
// only the book, chapter and verse elements get written.
bool no_text = false;


// Forward declarations. The open_close_* functions call each other (a parent
// closes its child, a child opens its parent), and output_content() calls the
// output_* helpers that are defined after it.
void open_close_book (bool open);
void open_close_chapter (bool open);
void open_close_verse (bool open);
void output_content (const ustring& marker, const ustring& content);
void output_identifiers (const ustring& marker, ustring& line);
void output_introduction (const ustring& marker, ustring& line);
void output_headings (const ustring& marker, ustring& line);
void output_chapters (const ustring& marker, ustring& line);
void output_extended_study_notes (const ustring& marker, ustring& line);
void output_foot_end_notes (ustring& line);
void output_notes (ustring& line, const ustring& opening_marker, const ustring& closing_marker);
void output_crossreferences (ustring& line);
void output_verse_text (ustring& line);
void clear_out_any_marker (ustring& line);


void open_close_book (bool open)
// Switches the book element on (open == true) or off (open == false).
// Opening writes the book start tag carrying the current bookname.
// Closing first closes a chapter that is still open, then writes the book
// end tag. Asking for the state the book is already in does nothing.
{
  static bool book_is_open = false;
  // Already in the requested state: nothing to write.
  if (open == book_is_open)
    return;
  if (!open) {
    // The chapter is the book's child, so it has to be closed before the book.
    open_close_chapter (false);
    cout << xml_tag (0, BOOK_TAG, true) << endl;
    book_is_open = false;
    return;
  }
  cout << "<" << BOOK_TAG << " " << NAME_PROPERTY << "=\"" << bookname << "\">" << endl;
  book_is_open = true;
}


void open_close_chapter (bool open)
// Switches the chapter element on (open == true) or off (open == false).
// Opening makes sure the parent book is open and then writes the chapter start
// tag carrying the current chapternumber. Closing first closes a verse that
// is still open, then writes the chapter end tag. Asking for the state the
// chapter is already in does nothing.
{
  static bool chapter_is_open = false;
  // Already in the requested state: nothing to write.
  if (open == chapter_is_open)
    return;
  if (!open) {
    // The verse is the chapter's child, so it goes first.
    open_close_verse (false);
    cout << xml_tag (1, CHAPTER_TAG, true) << endl;
    chapter_is_open = false;
    return;
  }
  // A chapter can only live inside an opened book.
  open_close_book (true);
  cout << "  <" << CHAPTER_TAG << " " << NUMBER_PROPERTY << "=\"" << chapternumber << "\">" << endl;
  chapter_is_open = true;
}


void open_close_verse (bool open)
// Switches the verse element on (open == true) or off (open == false).
// Opening makes sure the parent chapter is open and then writes the verse
// start tag carrying the current versenumber. Closing writes the verse end
// tag; a verse has no children to close. Asking for the state the verse is
// already in does nothing.
{
  static bool verse_is_open = false;
  // Already in the requested state: nothing to write.
  if (open == verse_is_open)
    return;
  if (!open) {
    cout << xml_tag (2, VERSE_TAG, true) << endl;
    verse_is_open = false;
    return;
  }
  // A verse can only live inside an opened chapter.
  open_close_chapter (true);
  cout << "    <" << VERSE_TAG << " " << NUMBER_PROPERTY << "=\"" << versenumber << "\">" << endl;
  verse_is_open = true;
}


void output_content (const ustring& marker, const ustring& content)
// Ouputs any other content besides book/chapter/verse. Deals with its parent.
{
  open_close_verse (true);
  // If we don't want any text, stop here.
  if (no_text)
    return;
  if (categorize) {
    // Subdivide the data into categories.
    // Each category removes something from this content, until it is empty.
    ustring content2 (content);
    // Deal with identifiers.
    output_identifiers (marker, content2);
    // Deal with introduction elements.
    output_introduction (marker, content2);
    // Deal with headings, titles, labels.
    output_headings (marker, content2);
    // Deal with chapter text.
    output_chapters (marker, content2);
    // Deal with extended study notes. As these use the existing footnote
    // markers, deal with the study notes first.
    output_extended_study_notes (marker, content2);      
    // Deal with footnotes, endnotes.
    output_foot_end_notes (content2);
    // Deal with crossreferences.
    output_crossreferences (content2);
    // After everything else has been removed, output the rest as main text.
    // This includes the "Verses" group, the "Paragraph Elements", and the
    // "Poetry Elements", the "Table Elements", and the "Special Text and
    // Character Styles", which have been filtered out already above.
    output_verse_text (content2);
  } else {
    // No categorization: just dump all data to stdout (fd: 1).
    // Normally we would have written this code: cout << content << endl;
    // But as cout behaves unpredictably when \f (the footnote usfm marker) is 
    // sent to it, we cannot use it here, and instead use the write call.
    // Cout also has problems with certain Unicode characters.
    write (1, "\\", 1);
    write (1, marker.c_str(), strlen (marker.c_str()));
    if (!content.empty()) {
      write (1, " ", 1);
      gchar  * escaped_text;
      escaped_text = g_markup_escape_text (content.c_str(), strlen (content.c_str()));
      write (1, escaped_text, strlen (escaped_text));
      g_free (escaped_text);
    }
    write (1, "\n", 1);
  }
}


void output_identifiers (const ustring& marker, ustring& line)
// If marker is one of the identification markers, writes line (with any
// embedded markers removed) as identifier text and empties line.
// Does nothing if line is empty or the marker is of another kind.
{
  static const char * const identifier_markers[] = {
    "id", "ide", "rem", "h", "h1", "h2", "h3", "toc", "toc1", "toc2",
  };
  if (line.empty())
    return;
  bool is_identifier = false;
  for (size_t i = 0; i < sizeof (identifier_markers) / sizeof (identifier_markers[0]); i++) {
    if (marker == identifier_markers[i]) {
      is_identifier = true;
      break;
    }
  }
  if (!is_identifier)
    return;
  clear_out_any_marker (line);
  ustring xml = xml_text_embed_in_tags (3, IDENTIFIER_TEXT_TAG, line);
  // Written with write() rather than cout, like all text output in this file.
  write (1, xml.c_str(), strlen (xml.c_str()));
  write (1, "\n", 1);
  // Mark the content as handled, so no later category outputs it again.
  line.clear();
}


void output_introduction (const ustring& marker, ustring& line)
// If marker is one of the introduction markers, writes line (with any
// embedded markers removed) as introduction text and empties line.
// Does nothing if line is empty or the marker is of another kind.
// The list includes is3, is4 and iq4, which are also in markers_that_split;
// without them such lines would fall through and end up as verse text.
{
  static const char * const introduction_markers[] = {
    "imt", "imt1", "imt2", "imt3", "imt4",
    "is", "is1", "is2", "is3", "is4",
    "ip", "ipi", "im", "imi", "ipq", "imq", "ipr", "ib", "iex",
    "iq", "iq1", "iq2", "iq3", "iq4",
    "iot", "io", "io1", "io2", "io3", "io4",
    "imte", "ie",
  };
  if (line.empty())
    return;
  bool is_introduction = false;
  for (size_t i = 0; i < sizeof (introduction_markers) / sizeof (introduction_markers[0]); i++) {
    if (marker == introduction_markers[i]) {
      is_introduction = true;
      break;
    }
  }
  if (!is_introduction)
    return;
  clear_out_any_marker (line);
  ustring xml = xml_text_embed_in_tags (3, INTRODUCTION_TEXT_TAG, line);
  // Written with write() rather than cout, like all text output in this file.
  write (1, xml.c_str(), strlen (xml.c_str()));
  write (1, "\n", 1);
  // Mark the content as handled, so no later category outputs it again.
  line.clear();
}


void output_headings (const ustring& marker, ustring& line)
// If marker is one of the title, heading or label markers, writes line (with
// any embedded markers removed) as heading text and empties line.
// Does nothing if line is empty or the marker is of another kind.
{
  static const char * const heading_markers[] = {
    "mt", "mt1", "mt2", "mt3", "mt4",
    "mte", "mte1", "mte2", "mte3", "mte4",
    "ms", "ms1", "ms2", "ms3", "ms4", "mr",
    "s", "s1", "s2", "s3", "s4", "sr",
    "r", "d", "sp",
  };
  if (line.empty())
    return;
  bool is_heading = false;
  for (size_t i = 0; i < sizeof (heading_markers) / sizeof (heading_markers[0]); i++) {
    if (marker == heading_markers[i]) {
      is_heading = true;
      break;
    }
  }
  if (!is_heading)
    return;
  clear_out_any_marker (line);
  ustring xml = xml_text_embed_in_tags (3, HEADING_TEXT_TAG, line);
  // Written with write() rather than cout, like all text output in this file.
  write (1, xml.c_str(), strlen (xml.c_str()));
  write (1, "\n", 1);
  // Mark the content as handled, so no later category outputs it again.
  line.clear();
}


void output_chapters (const ustring& marker, ustring& line)
// If marker is one of the chapter markers, writes line (with any embedded
// markers removed) as chapter text and empties line.
// Does nothing if line is empty or the marker is of another kind.
{
  static const char * const chapter_markers[] = { "c", "ca", "cl", "cp", "cd" };
  if (line.empty())
    return;
  bool is_chapter = false;
  for (size_t i = 0; i < sizeof (chapter_markers) / sizeof (chapter_markers[0]); i++) {
    if (marker == chapter_markers[i]) {
      is_chapter = true;
      break;
    }
  }
  if (!is_chapter)
    return;
  clear_out_any_marker (line);
  ustring xml = xml_text_embed_in_tags (3, CHAPTER_TEXT_TAG, line);
  // Written with write() rather than cout, like all text output in this file.
  write (1, xml.c_str(), strlen (xml.c_str()));
  write (1, "\n", 1);
  // Mark the content as handled, so no later category outputs it again.
  line.clear();
}


void output_extended_study_notes (const ustring& marker, ustring& line)
// If marker is one of the extended study note markers, writes line (with any
// embedded markers removed) as study note text and empties line.
// Does nothing if line is empty or the marker is of another kind.
{
  static const char * const study_note_markers[] = { "env", "enw", "enk", "enc" };
  if (line.empty())
    return;
  bool is_study_note = false;
  for (size_t i = 0; i < sizeof (study_note_markers) / sizeof (study_note_markers[0]); i++) {
    if (marker == study_note_markers[i]) {
      is_study_note = true;
      break;
    }
  }
  if (!is_study_note)
    return;
  clear_out_any_marker (line);
  ustring xml = xml_text_embed_in_tags (3, STUDY_NOTE_TEXT_TAG, line);
  // Written with write() rather than cout, like all text output in this file.
  write (1, xml.c_str(), strlen (xml.c_str()));
  write (1, "\n", 1);
  // Mark the content as handled, so no later category outputs it again.
  line.clear();
}


void output_foot_end_notes (ustring& line)
// Takes the footnotes (\f ...\f*) and after that the endnotes (\fe ...\fe*)
// out of line and writes them through output_notes().
{
  // Opening and closing marker of each kind of note, in processing order.
  static const char * const note_markers[][2] = {
    { "\\f ", "\\f*" },
    { "\\fe ", "\\fe*" },
  };
  for (size_t i = 0; i < sizeof (note_markers) / sizeof (note_markers[0]); i++)
    output_notes (line, note_markers[i][0], note_markers[i][1]);
}


void output_notes (ustring& line, const ustring& opening_marker, const ustring& closing_marker)
// Takes every note that is enclosed by opening_marker and closing_marker out
// of line, and writes the text of each note (with its inner markers removed)
// as note text on a line of its own.
// A note is only extracted if its closing marker follows its opening marker;
// an opening marker without a closing marker after it stays in line.
{
  size_t beginposition = line.find (opening_marker);
  while (beginposition != string::npos) {
    // The note text starts right after the opening marker.
    size_t textposition = beginposition + opening_marker.length();
    // Look for the closing marker only after the opening marker. A stray
    // closing marker earlier in the line would otherwise make the length
    // calculations below wrap around.
    size_t endposition = line.find (closing_marker, textposition);
    if (endposition == string::npos)
      break;
    // The length is measured from the end of the opening marker, so the
    // markers are not required to be equally long.
    ustring notetext = line.substr (textposition, endposition - textposition);
    // Remove the whole note, both markers included, from the line.
    line.erase (beginposition, endposition + closing_marker.length() - beginposition);
    clear_out_any_marker (notetext);
    ustring xml = xml_text_embed_in_tags (3, NOTE_TEXT_TAG, notetext);
    // Written with write() rather than cout, like all text output in this file.
    write (1, xml.c_str(), strlen (xml.c_str()));
    write (1, "\n", 1);
    beginposition = line.find (opening_marker);
  }
}


void output_crossreferences (ustring& line)
// Takes every crossreference (\x ...\x*) out of line, and writes the text of
// each one (with its inner markers removed) as crossreference text on a line
// of its own.
// A crossreference is only extracted if its closing marker follows its opening
// marker; an opening marker without a closing marker after it stays in line.
{
  if (line.empty())
    return;
  ustring opening_marker = "\\x ";
  ustring closing_marker = "\\x*";
  size_t beginposition = line.find (opening_marker);
  while (beginposition != string::npos) {
    // The reference text starts right after the opening marker.
    size_t textposition = beginposition + opening_marker.length();
    // Look for the closing marker only after the opening marker. A stray
    // closing marker earlier in the line would otherwise make the length
    // calculations below wrap around.
    size_t endposition = line.find (closing_marker, textposition);
    if (endposition == string::npos)
      break;
    // The length is measured from the end of the opening marker.
    ustring referencetext = line.substr (textposition, endposition - textposition);
    // Remove the whole crossreference, both markers included, from the line.
    line.erase (beginposition, endposition + closing_marker.length() - beginposition);
    clear_out_any_marker (referencetext);
    ustring xml = xml_text_embed_in_tags (3, CROSSREFERENCE_TEXT_TAG, referencetext);
    // Written with write() rather than cout, like all text output in this file.
    write (1, xml.c_str(), strlen (xml.c_str()));
    write (1, "\n", 1);
    beginposition = line.find (opening_marker);
  }
}


void output_verse_text (ustring& line)
// Writes whatever is left in line (with any markers removed) as verse text,
// and empties line. Does nothing if line is empty.
{
  if (!line.empty()) {
    clear_out_any_marker (line);
    const ustring xml = xml_text_embed_in_tags (3, VERSE_TEXT_TAG, line);
    // cout << xml << endl; would be the obvious way to write this, but cout
    // gives problems with unicode characters. The write call works well.
    write (1, xml.c_str(), strlen (xml.c_str()));
    write (1, "\n", 1);
    line.clear();
  }
}


void clear_out_any_marker (ustring& line)
// Removes every USFM marker from line, in place. A marker starts at a
// backslash and runs up to and including the first space or asterisk after
// it; if there is neither, it runs to the end of the line.
{
  size_t startpos = line.find ("\\");
  while (startpos != string::npos) {
    size_t endpos = line.find_first_of (" *", startpos);
    if (endpos == string::npos) {
      // No terminator: the marker is the rest of the line.
      line.erase (startpos);
    } else {
      // Remove the backslash, the marker name, and the terminating character.
      line.erase (startpos, endpos - startpos + 1);
    }
    // Continue at the same position: the text after the removed marker has
    // moved up to startpos, and it may start with the next marker right away
    // (e.g. "\nd*\add "). Skipping a character here would leave it in.
    startpos = line.find ("\\", startpos);
  }
}


void split_line_on_marker (ustring marker, ustring& line)
// Inserts a newline just before any place where the marker occurs in line.
// Only the complete marker matches: a backslash, the marker, and a space.
{
  // Turn the bare marker into the text to look for.
  marker.insert (0, "\\");
  marker.append (" ");
  // After each insertion the search resumes two positions further on: past
  // the newline just inserted and past the backslash of the marker found.
  for (size_t position = line.find (marker, 0);
       position != string::npos;
       position = line.find (marker, position + 2)) {
    line.insert (position, "\n");
  }
}


// Number of entries in markers_that_split.
#define SPLIT_MARKER_COUNT 145
// The USFM markers that start a new line: main() inserts a line break before
// each occurrence of one of these markers.
// The entries point to string literals, hence pointers to const char.
const char *markers_that_split[SPLIT_MARKER_COUNT] =
{ "id",  "ide",  "rem",  "h",  "h1",  "h2",  "h3",  "toc1",  "toc2",  "imt",  
  "imt1",  "imt2",  "imt3",  "imt4",  "is",  "is1",  "is2",  "is3",  "is4",  
  "ip",  "ipi",  "im",  "imi",  "ipq",  "imq",  "ipr",  "ib",  "iex",  "iq",  
  "iq1",  "iq2",  "iq3",  "iq4",  "iot",  "io",  "io1",  "io2",  "io3",  "io4",  
  "imte",  "ie",  "mt",  "mt1",  "mt2",  "mt3",  "mt4",  "mte",  "mte1",  
  "mte2",  "mte3",  "mte4",  "ms",  "ms1",  "ms2",  "ms3",  "ms4",  "mr",  "s",
  "s1",  "s2",  "s3",  "s4",  "sr",  "r",  "d",  "sp",  "c",  "ca",  "cl",  
  "cp",  "cd",  "v",  "p",  "m",  "pmo",  "pm",  "pmc",  "pmr",  "pi",  "pi1",
  "pi2",  "pi3",  "pi4",  "mi",  "nb",  "cls",  "li",  "li1",  "li2",  "li3", 
  "li4",  "pc",  "pr",  "ph",  "ph1",  "ph2",  "ph3",  "ph4",  "b",  "q",  "q1",
  "q2",  "q3",  "q4",  "qr",  "qc",  "qs",  "qa",  "qm",  "qm1",  "qm2",  "qm3",
  "qm4",  "tr",  "th1",  "th2",  "th3",  "th4",  "thr1",  "thr2",  "thr3", 
  "thr4",  "tc1",  "tc2",  "tc3",  "tc4",  "tcr1",  "tcr2",  "tcr3",  "tcr4", 
  "env",  "enw",  "enk",  "enc",  "lit",  "pb",  "pub",  "pref",  "intro", 
  "conc",  "glo",  "idx",  "maps",  "cov",  "spine",    
};


int main (int argc, char *argv[])
// Read the USFM files, convert it to the format used internally
// by all the checks, and dump the result to stdout.
{
  if (argc == 1) {
    cout << "sc-input-usfm reads the USFM files given as arguments on the commandline" << endl;
    cout << "and outputs them on stdout in a format used internally by all the checks." << endl;
    cout << "E.g.:" << endl;
    cout << "<book name=\"3 John\">" << endl;
    cout << "  <chapter number=\"1\">" << endl;
    cout << "  </chapter>" << endl;
    cout << "    <verse number=\"1\">" << endl;
    cout << "\v 1 The elder unto the beloved Gaius" << endl;
    cout << "    </verse>" << endl;
    cout << "</book>" << endl;
    cout << "Parameters:" << endl;
    cout << "--categorize" << endl;
    cout << "  This outputs the data in categories, like Identification, Introduction, etc." << endl;
    cout << "--no-text" << endl;
    cout << "  Outputs no text, only the book, chapter- and verse numbers." << endl;
    return 0;
  }
  
  // The input files to read.
  vector<ustring> input_files;
  
  // Process command line arguments.
  for (int i = 1; i < argc; i++) {
    ustring argument;
    argument = argv[i];
    if (argument.length() > 2) {
      if (argument.substr (0, 2) == "--") {
        argument.erase (0, 2);
        if (argument == "categorize") {
          categorize = true;
        }
        if (argument == "no-text") {
          no_text = true;
        }
        continue;
      }
    }
    input_files.push_back (argument);
  }
  // Variables for our use.
  ustring text;
  text.clear();
  ustring previous_marker;
  // Markers that split a line.
  set<ustring> splitters;
  for (unsigned int i = 0; i < SPLIT_MARKER_COUNT; i++)
    splitters.insert (markers_that_split[i]);
  // Read the files and go through them.
  for (unsigned int fc = 0; fc < input_files.size(); fc++) {
    // Storage for cleaned up lines.
    vector<ustring> lines;
    // Read the text.
    ReadText rt (input_files[fc], true, true);
    for (unsigned int i = 0; i < rt.lines.size(); i++) {
      // Change tabs to spaces.
      size_t position;
      position = rt.lines[i].find ("\t");
      while (position != string::npos) {
        rt.lines[i].replace (position, 1, " ");
        position = rt.lines[i].find ("\t", position);
      }
      // Change multiple spaces to one space.
      position = rt.lines[i].find ("  ");
      while (position != string::npos) {
        rt.lines[i].replace (position, 2, " ");
        position = rt.lines[i].find ("  ");
      }
      // Cut into handsome lines, according to the USFM standard.
      vector<ustring> markers;
      markers = usfm_get_all_markers (rt.lines[i]);
      for (unsigned int i2 = 0; i2 < markers.size(); i2++) {
        if (splitters.find (markers[i2]) != splitters.end())
          split_line_on_marker (markers[i2], rt.lines[i]);
      }
      // Add a line without USFM to the previous one.
      if (markers.empty()) {
        lines[lines.size() - 1].append (" ");
        lines[lines.size() - 1].append (rt.lines[i]);
        continue;
      }
      // Separate and trim these lines.
      position = rt.lines[i].find ("\n");
      while (position != string::npos) {
        ustring line = rt.lines[i].substr (0, position);
        line = trim (line);
        if (!line.empty())
          lines.push_back (line);
        rt.lines[i].erase (0, ++position);
        position = rt.lines[i].find ("\n");
      }
      if (!rt.lines[i].empty())
        lines.push_back (trim (rt.lines[i]));
    }
    // Initialize some variables.
    bookname = "Unknown";
    chapternumber = "0";
    versenumber = "0";
    for (unsigned int i = 0; i < lines.size(); i++) {
      // Split the line on the main markers and handle them all.
      vector<ustring> split_lines;
      split_lines = usfm_split_on_main_markers (lines[i]);
      for (unsigned int i2 = 0; i2 < split_lines.size(); i2++) {
        // Extract the marker, and deal with it.
        ustring marker;
        marker = usfm_extract_marker (split_lines[i2]);
        if (marker == "id") {
          // ID marker found.
          ustring id = split_lines[i2].substr (0, 3);
          bookname = id_to_biblebook_english (id);
          open_close_book (true);
          output_content (marker, split_lines[i2]);
        } else if (marker == "c") {
          // Chapter found.
          // Close any previous chapter.
          open_close_chapter (false);
          // Open new chapter.
          chapternumber = number_in_string (split_lines[i2]);
          open_close_chapter (true);
          versenumber = "0";
          output_content (marker, split_lines[i2]);
        } else if (marker == "v") {
          // Verse found.
          // Close any previous verse.
          open_close_verse (false);
          // Get the verse number.
          versenumber = split_lines[i2];
          // Make it robust, even handling cases like:
          // - \v 1-2“Moi - No space after verse number.
          size_t spaceposition = versenumber.find_first_not_of ("0123456789,-ab");
          if (spaceposition != string::npos) {
            versenumber = versenumber.substr (0, spaceposition);
          }
          // Handle case the usfm file does not contain the \c 1.
          if ((versenumber != "0") && (chapternumber == "0")) {
            open_close_chapter (false);
            chapternumber = "1";
            open_close_chapter (true);
          }
          // Open the verse number.
          open_close_verse (true);
          // Add remaining content.
          output_content (marker, split_lines[i2]);
        } else if (marker.empty()) {
          // No marker found.
          output_content (previous_marker, split_lines[i2]);
        } else {
          // Any other marker found.
          output_content (marker, split_lines[i2]);
        }
        previous_marker = marker;
      }
    }
    // We reached the end: close book (and its children).
    open_close_book (false);
  }
  return 0;
}

Generated by  Doxygen 1.6.0   Back to index