// Source code of the bibledit package.
// File: usfmtools.cpp

/*
** Copyright (C) 2003-2006 Teus Benschop.
**  
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 2 of the License, or
** (at your option) any later version.
**  
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
** GNU General Public License for more details.
**  
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**  
*/


#include "libraries.h"
#include "utilities.h"
#include "usfm.h"
#include "usfmtools.h"


ustring usfm_extract (ustring & line)
// Returns the usfm that starts the line, backslash included, e.g. \id,
// and removes that usfm from the line.
// If the trimmed line is shorter than two characters or does not start with a
// backslash, an empty string is returned and the line is left untouched.
// Otherwise the line is left holding the trimmed text that followed the usfm.
{
  ustring returnvalue;
  ustring s = trim (line);
  if (s.length () < 2)
    return returnvalue;
  if (s.substr (0, 1) != "\\")
    return returnvalue;
  // Sometimes markers are not followed by a space, but by another marker.
  // So check for the backslash too.
  // Sometimes markers are followed by e.g. \x immediately, without a space
  // in between. This \x has been transformed to fo:inline code already, so
  // check for the "<" character too.
  size_t endposition = s.find_first_of (" <\\", 1);
  if (endposition == string::npos) {
    // Nothing follows the usfm: it makes up the whole line.
    returnvalue = s;
    line.clear ();
  } else {
    returnvalue = s.substr (0, endposition);
    // The end position was measured in the trimmed copy, so the usfm has to be
    // cut off that copy. Erasing the same number of characters from the raw
    // line would remove the wrong text whenever the raw line differs from the
    // trimmed copy at its start (presumably: leading whitespace stripped by
    // trim - confirm against the helper).
    s.erase (0, endposition);
    line = trim (s);
  }
  return returnvalue;
}


ustring usfm_extract_marker (ustring & line)
// Takes the leading usfm off the line through usfm_extract and returns its
// bare name, that is, without the backslash in front: e.g. id.
// Returns an empty string when usfm_extract found nothing.
{
  ustring marker = usfm_extract (line);
  if (!marker.empty ()) {
    // Drop the leading backslash.
    marker.erase (0, 1);
  }
  return marker;
}


int usfm_search_marker_from (const ustring & line, unsigned int from)
// Searches for the start of any usfm, that is, for a backslash, whether or not
// the usfm that follows it is a known one.
// It starts searching at position "from".
// It returns the position where it found the backslash. If nothing is found the
// search yields string::npos, which is narrowed to the int return type here,
// so callers do not receive string::npos itself (presumably -1 on common
// platforms - confirm against the callers).
{
  const size_t found_at = line.find ("\\", from);
  return found_at;
}


unsigned int usfm_search_marker_from_length (const ustring & line, unsigned int from)
// Returns the length of the usfm that is found at position "from".
{
  ustring s;
  s = line.substr (from, line.length ());
  return usfm_extract (s).length ();
}


bool usfm_search_marker (const ustring& line, ustring& marker, size_t& position, size_t& length, bool& opening_marker)
/*
Searches for any marker in "line". 
Returns its position when found or string::npos if not found.
Puts the marker itself, without backslashes and all, into "marker".
If it is an opening marker, sets "opening_marker" true.
E.g. this is a line:
  And \nd God\nd* said:
The variables will be set so:
marker = nd;
position = 4;
length = 4;
opening_marker = true;
*/
{
  // Search for the marker.
  position = line.find ("\\");
  // Not found: bail out.
  if (position == string::npos)
    return false;
  // Deal with the rest of the line.
  ustring remaining_part;
  remaining_part = line.substr (position, line.length() - position);
  if (!remaining_part.empty())
    remaining_part.erase (0, 1);
  // Get end of marker.
  size_t endposition = remaining_part.find_first_of (" *\\");  
  // No endposition found: the marker is at the end of the line.
  if (endposition == string::npos)
    endposition = remaining_part.length();
  // We've a marker -> set other variables.
  marker = remaining_part.substr (0, endposition);
  opening_marker = (remaining_part.substr (endposition, 1) == " ");
  if (remaining_part.substr (endposition, 1) == "\\") {
    length = endposition + 1;
  } else {
    length = endposition + 2;
  }
  if (endposition == remaining_part.length())
    length = endposition + 1;
  return true;
}


vector<ustring> usfm_get_all_markers (const ustring& line)
// Collects all the markers in the line, in the order in which they occur.
// It works on a copy of the line and keeps calling
// usfm_extract_marker_within_line on it until that returns an empty marker.
{
  vector<ustring> found;
  ustring remainder (line);
  for (ustring marker = usfm_extract_marker_within_line (remainder);
       !marker.empty();
       marker = usfm_extract_marker_within_line (remainder)) {
    found.push_back (marker);
  }
  return found;
}


ustring usfm_extract_marker_within_line (ustring & line)
/*
Looks for the first usfm anywhere in the line, through usfm_extract_within_line,
and returns it trimmed and without its leading backslash, e.g. id.
The usfm, and whatever usfm_extract_within_line removes along with it, is
taken out of the line.
Returns an empty string if there was no usfm.
*/
{
  ustring marker = trim (usfm_extract_within_line (line));
  if (!marker.empty ()) {
    // Drop the leading backslash.
    marker.erase (0, 1);
  }
  return marker;
}


ustring usfm_extract_within_line (ustring & line)
/*
Returns the first usfm found anywhere in the line, e.g. "\id ".
The returned text starts at the backslash and includes the space or asterisk
that ends the usfm. If the usfm runs into the backslash of a next usfm, it
stops just before that backslash, which is left in the line so that the next
call finds the next usfm.
The usfm, and any text in front of it, is removed from the line.
The line is trimmed in all cases; if it has no backslash an empty string is
returned.
*/
{
  line = trim (line);
  ustring returnvalue;
  const size_t offposition = line.find ("\\");
  if (offposition == string::npos)
    return returnvalue;
  // Discard everything before the usfm.
  line.erase (0, offposition);
  // Sometimes markers are not followed by a space, but by another marker, so
  // look for the backslash too, as is done when extracting a usfm at the start
  // of a line. Without this, "\p\v 1" would yield the single usfm "\p\v ".
  const size_t endposition = line.find_first_of (" *\\", 1);
  if (endposition == string::npos) {
    // The usfm runs to the end of the line.
    returnvalue = line;
    line.clear();
    return returnvalue;
  }
  // A space or asterisk belongs to this usfm; a backslash belongs to the next.
  size_t cut = endposition;
  if (line.substr (endposition, 1) != "\\")
    cut++;
  returnvalue = line.substr (0, cut);
  line.erase (0, cut);
  return returnvalue;
}


void split_line_on_marker (ustring marker, ustring& line)
// Inserts a newline just before any place where "marker" occurs in the line.
// "marker" is given bare, e.g. v; it is looked for as an opening marker, that
// is, with a backslash in front and a space behind it.
{
  const ustring needle = "\\" + marker + " ";
  // After a newline went in at "hit", the marker itself sits at hit + 1, so
  // the search resumes at hit + 2, past the start of the marker just handled.
  for (size_t hit = line.find (needle, 0);
       hit != string::npos;
       hit = line.find (needle, hit + 2)) {
    line.insert (hit, "\n");
  }
}


ustring get_usfm_id_from_file (const ustring & filename)
// Gets the Paratext ID from a file: the first three characters that follow
// the id marker, looked for in the first 100 bytes of the file.
// Deals with Windows "feature" to put \xEF\xBB\xBF at the start of a textfile.
// Returns an empty string if the file is smaller than 50 bytes, cannot be
// read, or has no id marker in the part that is looked at.
{
  // If the file is too short, probably no USFM file.
  if (file_get_size (filename) < 50)
    return "";
  // Get contents. The pointer starts out as NULL and the result of the call is
  // checked, so that a failed read never leaves us with an indeterminate
  // pointer.
  gchar *contents = NULL;
  if (!g_file_get_contents (filename.c_str(), &contents, NULL, NULL))
    return "";
  // If we got nothing, then there is no ID.
  if (!contents)
    return "";
  // Clear contents up.
  // NOTE(review): the pointer freed below is the one de_windows_notepad
  // returned. That is only valid if the helper hands back the start of the
  // allocation (or a fresh one) - confirm against the helper.
  contents = de_windows_notepad (contents);
  // Only the start of the file is of interest.
  gchar *contents_shortened = g_strndup (contents, 100);
  ParseLine parseline (contents_shortened);
  g_free (contents);
  g_free (contents_shortened);
  // Look for an ID.
  for (unsigned int i = 0; i < parseline.lines.size(); i++) {
    ustring line = trim (parseline.lines[i]);
    ustring marker = usfm_extract_marker (line);
    if (usfm_is_id (marker)) {
      try
      {
        // After extraction the line holds what followed the marker.
        return line.substr (0, 3);
      }
      catch (exception & ex)
      {
        cerr << ex.what () << endl;
      }
    }
  }
  return "";
}


void split_line_into_usfm_and_text (const ustring& line, ustring& marker, ustring& text)
/*
Splits all text into marker and text. 
E.g. "\v 1 In the beginning ..." is split into:
- \v 1 
- In the beginning ...
*/
{
  // Do the first split.
  text = line;
  marker = usfm_extract_marker (text);
  // If this line did not have any usfm, we're through.
  if (marker.empty())
    return;
  // Certain usfms, like for chapter and verse, need to have the numbers too.
  if (usfm_is_chapter (marker) || usfm_is_verse (marker)) {
    ustring number;
    number = number_in_string (text);
    text.erase (0, number.length());
    text = trim (text);
    marker.append (" ");
    marker.append (number);
  }
  // The backslash was extracted, so put it back.
  marker.insert(0, "\\");
}


void usfm_handle_inline_text (ustring& line, UsfmInlineMarkers * inline_markers, XmlFoBlock * block, bool sword)
/*
Replaces the inline text markers in the line with markup: Sword markup if
"sword" is true, else xslfo markup for the given block.
1. Usfm text that has the opening marker, but not the closing marker, may result
   in malformed output. To ensure this does not occur, the opening tag
   is only placed in the output when the closing usfm is found too, and found
   after the opening one. A closing marker that precedes the opening marker
   does not count as its partner and is left alone.
2. For inline text markers, the space following the opening marker is
   part of the marker, and not part of the word.
*/
{
  // Go through all markers, and look for them in the line.
  for (unsigned int i = 0; i < inline_markers->opening_markers.size(); i++) {
    // Allow for a marker to occur more than once in the line.
    for (;;) {
      const string::size_type begin_position = line.find (inline_markers->opening_markers[i]);
      if (begin_position == string::npos)
        break;
      const string::size_type opening_length = inline_markers->opening_markers[i].length();
      // The closing marker has to come after the opening one. Searching from
      // the start of the line would pair the opening marker with a stray
      // closing marker in front of it, and so give closing markup before
      // opening markup.
      if (line.find (inline_markers->closing_markers[i], begin_position + opening_length) == string::npos)
        break;
      // At this stage we have both the opening and closing marker, so
      // we're safe to replace the markers with the markup.
      ustring opening_markup;
      if (sword)
        opening_markup = inline_markers->opening_sword_markup (i);
      else
        opening_markup = inline_markers->opening_xslfo_markup (block, i);
      line.replace (begin_position, opening_length, opening_markup);
      // The opening markup shifted the text, so look the closing marker up
      // again, starting right after the markup just inserted.
      const string::size_type end_position = line.find (inline_markers->closing_markers[i], begin_position + opening_markup.length());
      const string::size_type closing_length = inline_markers->closing_markers[i].length();
      if (sword)
        line.replace (end_position, closing_length, inline_markers->closing_sword_markup);
      else
        line.replace (end_position, closing_length, inline_markers->closing_xslfo_markup);
    }
  }
}


void usfm_remove_inline_text_markers (ustring& line, UsfmInlineMarkers * inline_markers)
// Strips the inline text markers from the line, leaving the text between them.
// The closing marker of a pair is only removed if the opening marker of that
// pair occurs in the line.
{
  for (unsigned int i = 0; i < inline_markers->opening_markers.size(); i++) {
    // No opening marker in the line: leave this pair alone.
    if (line.find (inline_markers->opening_markers[i]) == string::npos)
      continue;
    replace_text (line, inline_markers->opening_markers[i], "");
    replace_text (line, inline_markers->closing_markers[i], "");
  }
}


// Tables of the known usfm markers, one table per category.
// Each NUMBER_OF_MARKERS_* must be equal to the number of markers in the table
// that follows it. usfm_categorize_markers goes through these tables, and the
// order of the markers within a table is the order in which they come out.


// Identification information.
#define NUMBER_OF_MARKERS_IdentificationInformation 9
char *USFM_MARKERS_IdentificationInformation [NUMBER_OF_MARKERS_IdentificationInformation] = {
  "id", "ide", "rem",
  "h", "h1", "h2", "h3",
  "toc1", "toc2"
};


// Introduction titles and headings.
#define NUMBER_OF_MARKERS_IntroductionTitlesHeadings 11
char *USFM_MARKERS_IntroductionTitlesHeadings [NUMBER_OF_MARKERS_IntroductionTitlesHeadings] = {
  "imt", "imt1", "imt2", "imt3", "imt4",
  "is", "is1", "is2", "is3", "is4",
  "imte"
};


// Introduction paragraphs and poetry.
#define NUMBER_OF_MARKERS_IntroductionParagraphsPoetry 14
char *USFM_MARKERS_IntroductionParagraphsPoetry [NUMBER_OF_MARKERS_IntroductionParagraphsPoetry] = {
  "ip", "ipi", "im", "imi", "imq", "ipr", "ipq", "ib", "iex",
  "iq", "iq1", "iq2", "iq3", "iq4"
};


// Introduction other elements.
#define NUMBER_OF_MARKERS_IntroductionOtherElements 8
char *USFM_MARKERS_IntroductionOtherElements [NUMBER_OF_MARKERS_IntroductionOtherElements] = {
  "iot",
  "io", "io1", "io2", "io3", "io4",
  "ior", "ie"
};


// Titles.
#define NUMBER_OF_MARKERS_Titles 11
char *USFM_MARKERS_Titles [NUMBER_OF_MARKERS_Titles] = {
  "mt", "mt1", "mt2", "mt3", "mt4",
  "mte", "mte1", "mte2", "mte3", "mte4",
  "d"
};


// Headings.
#define NUMBER_OF_MARKERS_Headings 14
char *USFM_MARKERS_Headings [NUMBER_OF_MARKERS_Headings] = {
  "ms", "ms1", "ms2", "ms3", "ms4",
  "mr",
  "s", "s1", "s2", "s3", "s4",
  "sr", "r", "sp"
};


// Chapters and verses.
#define NUMBER_OF_MARKERS_ChaptersAndVerses 8
char *USFM_MARKERS_ChaptersAndVerses [NUMBER_OF_MARKERS_ChaptersAndVerses] = {
  "c", "ca", "cl", "cp", "cd",
  "v", "va", "vp"
};


// Paragraphs.
#define NUMBER_OF_MARKERS_Paragraphs 22
char *USFM_MARKERS_Paragraphs [NUMBER_OF_MARKERS_Paragraphs] = {
  "p", "m", "pmo", "pm", "pmc", "pmr",
  "pi", "pi1", "pi2", "pi3", "pi4",
  "mi", "nb", "cls", "b", "pc", "pr",
  "ph", "ph1", "ph2", "ph3", "ph4"
};


// Lists.
#define NUMBER_OF_MARKERS_Lists 5
char *USFM_MARKERS_Lists [NUMBER_OF_MARKERS_Lists] = {
  "li", "li1", "li2", "li3", "li4"
};


// Poetry elements.
// The marker "b" is in the table of paragraphs too. Since a marker is taken
// out of the set once it has been categorized, and the paragraphs are handled
// before the poetry elements, "b" ends up among the paragraphs.
#define NUMBER_OF_MARKERS_PoetryElements 16
char *USFM_MARKERS_PoetryElements [NUMBER_OF_MARKERS_PoetryElements] = {
  "q", "q1", "q2", "q3", "q4",
  "qr", "qc", "qs", "qa", "qac",
  "qm", "qm1", "qm2", "qm3", "qm4",
  "b"
};


// Table elements.
#define NUMBER_OF_MARKERS_TableElements 25
char *USFM_MARKERS_TableElements [NUMBER_OF_MARKERS_TableElements] = {
  "tr",
  "th", "th1", "th2", "th3", "th4", "th5",
  "thr", "thr1", "thr2", "thr3", "thr4", "thr5",
  "tc", "tc1", "tc2", "tc3", "tc4", "tc5",
  "tcr", "tcr1", "tcr2", "tcr3", "tcr4", "tcr5"
};


// Footnotes.
#define NUMBER_OF_MARKERS_Footnotes 12
char *USFM_MARKERS_Footnotes [NUMBER_OF_MARKERS_Footnotes] = {
  "f", "fe", "fr", "fk", "fq", "fqa",
  "fl", "fp", "fv", "ft", "fdc", "fm"
};


// Cross references.
#define NUMBER_OF_MARKERS_CrossReferences 6
char *USFM_MARKERS_CrossReferences [NUMBER_OF_MARKERS_CrossReferences] = {
  "x", "xo", "xk", "xq", "xt", "xdc"
};


// Extended study notes.
#define NUMBER_OF_MARKERS_ExtendedStudyNotes 9
char *USFM_MARKERS_ExtendedStudyNotes [NUMBER_OF_MARKERS_ExtendedStudyNotes] = {
  "env", "enw", "enk", "enc",
  "cat1", "cat2", "cat3", "cat4", "cat5"
};


// Special text.
#define NUMBER_OF_MARKERS_SpecialText 13
char *USFM_MARKERS_SpecialText [NUMBER_OF_MARKERS_SpecialText] = {
  "qt", "nd", "tl", "dc", "bk", "sig", "pn",
  "wj", "k", "sls", "ord", "add", "lit"
};


// Character styles.
#define NUMBER_OF_MARKERS_CharacterStyles 6
char *USFM_MARKERS_CharacterStyles [NUMBER_OF_MARKERS_CharacterStyles] = {
  "no", "bd", "it", "bdit", "em", "sc"
};


// Spacings and breaks.
#define NUMBER_OF_MARKERS_SpacingsAndBreaks 1
char *USFM_MARKERS_SpacingsAndBreaks [NUMBER_OF_MARKERS_SpacingsAndBreaks] = {
  "pb"
};


// Special features.
#define NUMBER_OF_MARKERS_SpecialFeatures 6
char *USFM_MARKERS_SpecialFeatures [NUMBER_OF_MARKERS_SpecialFeatures] = {
  "fig", "pro", "w", "wh", "wg", "ndx"
};


// Peripheral materials.
#define NUMBER_OF_MARKERS_PeripheralMaterials 15
char *USFM_MARKERS_PeripheralMaterials [NUMBER_OF_MARKERS_PeripheralMaterials] = {
  "pub", "toc", "pref", "intro", "conc",
  "glo", "idx", "maps", "cov", "spine",
  "k1", "k2", "p1", "p2", "pubinfo"
};


/*
Other markers that may occur in the text.
/restore
  This is a marker inserted by Paratext when restoring a Paratext project backup
  file, if the switch "Append this description to every restored file" is
  checked in the Restore dialogue. And so, it needs to be present in the
  stylesheet (so that checking tools do not report a false error), but it is
  not really part of USFM.
*/


void usfm_categorize_markers_internal (unsigned int marker_count, 
                                       set<ustring>& marker_set,
                                       char * marker_definitions[], 
                                       UsfmCategory category,
                                       vector<ustring>& markers,
                                       vector<UsfmCategory>& categories)
// Internal repetitive function used by usfm_categorize_markers.
{
  for (unsigned int i = 0; i < marker_count; i++) {
    set<ustring>::const_iterator found_position;
    found_position = marker_set.find (marker_definitions[i]);
    if (found_position != marker_set.end()) {
      markers.push_back (marker_definitions[i]);
      categories.push_back (category);
      marker_set.erase (found_position);
    }
  }
}


void usfm_categorize_markers (vector<ustring>& markers, vector<UsfmCategory>& categories)
// Reorders the markers and gives the category of each of them.
// On return "markers" holds the known markers first, category by category and
// in the order of the marker tables, followed by the markers that are in none
// of the tables. "categories" runs parallel to it: categories[i] is the
// category of markers[i]. Markers that occurred more than once come out once.
{
  // The marker tables with their categories, in the order of the output.
  struct CategoryTable
  {
    unsigned int count;
    char ** definitions;
    UsfmCategory category;
  };
  const CategoryTable tables[] = {
    { NUMBER_OF_MARKERS_IdentificationInformation, USFM_MARKERS_IdentificationInformation, ucIdentificationInformation },
    { NUMBER_OF_MARKERS_IntroductionTitlesHeadings, USFM_MARKERS_IntroductionTitlesHeadings, ucIntroductionTitlesHeadings },
    { NUMBER_OF_MARKERS_IntroductionParagraphsPoetry, USFM_MARKERS_IntroductionParagraphsPoetry, ucIntroductionParagraphsPoetry },
    { NUMBER_OF_MARKERS_IntroductionOtherElements, USFM_MARKERS_IntroductionOtherElements, ucIntroductionOtherElements },
    { NUMBER_OF_MARKERS_Titles, USFM_MARKERS_Titles, ucTitles },
    { NUMBER_OF_MARKERS_Headings, USFM_MARKERS_Headings, ucHeadings },
    { NUMBER_OF_MARKERS_ChaptersAndVerses, USFM_MARKERS_ChaptersAndVerses, ucChaptersAndVerses },
    { NUMBER_OF_MARKERS_Paragraphs, USFM_MARKERS_Paragraphs, ucParagraphs },
    { NUMBER_OF_MARKERS_Lists, USFM_MARKERS_Lists, ucLists },
    { NUMBER_OF_MARKERS_PoetryElements, USFM_MARKERS_PoetryElements, ucPoetryElements },
    { NUMBER_OF_MARKERS_TableElements, USFM_MARKERS_TableElements, ucTableElements },
    { NUMBER_OF_MARKERS_Footnotes, USFM_MARKERS_Footnotes, ucFootnotes },
    { NUMBER_OF_MARKERS_CrossReferences, USFM_MARKERS_CrossReferences, ucCrossReferences },
    { NUMBER_OF_MARKERS_ExtendedStudyNotes, USFM_MARKERS_ExtendedStudyNotes, ucExtendedStudyNotes },
    { NUMBER_OF_MARKERS_SpecialText, USFM_MARKERS_SpecialText, ucSpecialText },
    { NUMBER_OF_MARKERS_CharacterStyles, USFM_MARKERS_CharacterStyles, ucCharacterStyles },
    { NUMBER_OF_MARKERS_SpacingsAndBreaks, USFM_MARKERS_SpacingsAndBreaks, ucSpacingsAndBreaks },
    { NUMBER_OF_MARKERS_SpecialFeatures, USFM_MARKERS_SpecialFeatures, ucSpecialFeatures },
    { NUMBER_OF_MARKERS_PeripheralMaterials, USFM_MARKERS_PeripheralMaterials, ucPeripheralMaterials }
  };
  const unsigned int table_count = sizeof (tables) / sizeof (tables[0]);
  // All markers go into a set. Known markers are taken out of it as they get
  // categorized, so that what remains in the end are the unknown ones.
  set<ustring> set_of_markers (markers.begin(), markers.end());
  // The output containers are filled from scratch.
  markers.clear();
  categories.clear();
  for (unsigned int t = 0; t < table_count; t++) {
    usfm_categorize_markers_internal (tables[t].count, set_of_markers, tables[t].definitions, tables[t].category, markers, categories);
  }
  // Whatever is left is not in any table: it goes to the non-standard styles.
  for (set<ustring>::const_iterator other = set_of_markers.begin(); other != set_of_markers.end(); ++other) {
    markers.push_back (*other);
    categories.push_back (ucNonstandardStyles);
  }
}


ustring usfm_get_category_name (UsfmCategory category)
// Gives the human readable name of a category of usfm markers.
// Returns an empty string for a value that is not handled below.
{
  switch (category) {
    case ucIdentificationInformation :    return "Identification Information";
    case ucIntroductionTitlesHeadings :   return "Introduction Titles and Headings";
    case ucIntroductionParagraphsPoetry : return "Introduction Paragraphs and Poetry";
    case ucIntroductionOtherElements :    return "Introduction Other Elements";
    case ucTitles :                       return "Titles";
    case ucHeadings :                     return "Headings";
    case ucChaptersAndVerses :            return "Chapters and Verses";
    case ucParagraphs :                   return "Paragraphs";
    case ucLists :                        return "Lists";
    case ucPoetryElements :               return "Poetry Elements";
    case ucTableElements :                return "Table Elements";
    case ucFootnotes :                    return "Footnotes";
    case ucCrossReferences :              return "Cross References";
    case ucExtendedStudyNotes :           return "Extended Study Notes";
    case ucSpecialText :                  return "Special Text";
    case ucCharacterStyles :              return "Character Styles";
    case ucSpacingsAndBreaks :            return "Spacings and Breaks";
    case ucSpecialFeatures :              return "Special Features";
    case ucPeripheralMaterials :          return "Peripheral Materials";
    case ucNonstandardStyles :            return "Non-standard Styles";
  }
  // Not one of the categories above.
  return ustring ();
}


ustring usfm_get_full_opening_marker (const ustring& marker)
// Makes the full opening marker out of a bare one: a backslash in front and
// a space behind it, e.g. "\nd " for nd.
{
  ustring full_marker ("\\");
  full_marker.append (marker);
  full_marker.append (" ");
  return full_marker;
}


ustring usfm_get_full_closing_marker (const ustring& marker)
// Makes the full closing marker out of a bare one: a backslash in front and
// an asterisk behind it, e.g. "\nd*" for nd.
{
  ustring full_marker ("\\");
  full_marker.append (marker);
  full_marker.append ("*");
  return full_marker;
}

// Generated by Doxygen 1.6.0