Mòideal:Language/data/ISO 639-2/make

(deasbaireachd | deasaich | eachdraidh | ceanglaichean | doc | bogsa-gainmhich | cùisean deuchainn)

This module is currently protected from editing.
See the protection policy and protection log for more details. Please discuss any changes on the talk page; you may submit an edit request to ask an administrator to make an edit if it is uncontroversial or supported by consensus. You may also request that this page be unprotected.

This module is subject to page protection. It is a highly visible module in use by a very large number of pages, or is substituted very frequently. Because vandalism or mistakes would affect many pages, and even trivial editing might cause substantial load on the servers, it is protected from editing.

Reads a local copy of data from the table at Codes for the Representation of Names of Languages, extracts the ISO 639-2 codes, names, and ISO 639-1 synonyms. There are two functions in the tool: ISO_synonym_extract() and ISO_2_name_extract().

Ùsaid - Usage

To use this tool:

open a blank sandbox page and paste either or both of these {{#invoke:}}s into it at the top:
for use in Mòideal:Lang/ISO 639 synonyms:
- {{#invoke:Language/data/ISO 639-2/make|ISO_synonym_extract|file-date=YYYY-MM-DD}}
for use in Mòideal:Language/data/ISO 639-2:
- {{#invoke:Language/data/ISO 639-2/make|ISO_2_name_extract|file-date=YYYY-MM-DD}}
go to the current Codes for the Representation of Names of Languages. Copy the content of the table on that page and paste it into the sandbox page below the {{#invoke:}}.
click Show preview
wait
get result

require('Module:No globals');

--[=[------------------------< I S O _ S Y N O N Y M _ E X T R A C T >-----------------------------------------

{{#invoke:Language/data/ISO 639-2/make|ISO_synonym_extract|file-date=2013-01-11}}

reads a local copy of data from the table at http://www.loc.gov/standards/iso639-2/php/English_list.php, extracts
the ISO 639-2 (or 639-2T) codes that have equivalent ISO 639-1 codes and creates a table to translate 639-2 to 639-1.
ISO-639-3 uses 639-2T codes

useful lines in the source table have the form:
	<English name>\t<all English names>\t<all French names>\t<639-2 code>\t<639-1 code>\n
where:
	<English name> is primary English name (not used here); one of <all English names> so duplicates code listing
	<all English names> is all of the English names (not used here)
	<all French names> is all of the French names (not used here)
	<639-2 code> is the three-character ISO 639-2 or 639-2B/639-2T language code; when 639-2T present, use that code
	<639-1 code> is the two-character ISO 639-1 language code synonym of the -2 code (if one is defined)
		
	like this (with synonym):
		Abkhazian	Abkhazian	abkhaze	abk	ab
	or (without synonym):
		Achinese	Achinese	aceh	ace	 

for the file date use the date listed at the bottom of the source page, written as YYYY-MM-DD (as in the example above); the value is copied verbatim into the File-Date comment of the output

]=]

local function ISO_synonym_extract (frame)
	-- Builds the text of a Lua table that maps ISO 639-2 (639-2T when a B/T pair is listed) codes to their
	-- ISO 639-1 synonyms, from the tab-separated rows found in the unparsed content of the current page.
	--
	-- frame: the {{#invoke:}} frame; frame.args["file-date"] is required and is copied verbatim into the
	--        'File-Date:' comment at the top of the output
	-- returns: a string of the form '<br /><pre>-- File-Date: ...<br />return {...}<br /></pre>' with one
	--          '["<639-2 code>"] = "<639-1 code>"' entry per code, sorted
	-- raises: an error when |file-date= is missing

	local function trim (s)														-- strip leading and trailing whitespace from a field
		return (s:match ('^%s*(.-)%s*$'));										-- parentheses keep only the first return value
	end

	local date_arg = frame.args["file-date"];
	if not date_arg then														-- fail with a readable message instead of a nil-concatenation error
		error ('|file-date= is required (use the date listed at the bottom of the source page)', 0);
	end
	local file_date = 'File-Date: ' .. date_arg;								-- set the file date line from |file-date=

	local page = mw.title.getCurrentTitle();									-- get a page object for this page
	local content = page:getContent() or '';									-- get unparsed content; empty when the page has no content
	local skip_table = {};														-- table of 639-2/639-2T codes that have been handled; used to prevent duplication
	local out_table = {};														-- output table

	for _, line in ipairs (mw.text.split (content, '[\r\n]')) do				-- for each line of the source
		local split_table = mw.text.split (line, '\t');							-- split at the tabs
		local synonym = split_table[5] and trim (split_table[5]) or '';		-- 639-1 field; blank or whitespace-only means no synonym

		if '' ~= synonym then													-- if there is a 639-1 code
			-- when 639-2B/639-2T use 639-2T (whitespace around the slash is tolerated) else use 639-2
			local code = split_table[4]:match ('%a+%s*/%s*(%a+)') or trim (split_table[4]);
			if not skip_table[code] then										-- source lists a code once per language name so skip repeats
				skip_table[code] = true;										-- remember that we've handled this 639-2/639-2T code
				table.insert (out_table, "[\"" .. code .. "\"] = \"" .. synonym .. "\"");	-- make new table entry
			end
		end
	end

	table.sort (out_table);

	return "<br /><pre>-- " .. file_date .. "<br />return {<br />&#9;" .. table.concat (out_table, ',<br />&#9;') .. "<br />&#9;}<br />" .. "</pre>";
end


--[[--------------------------< I S O _ 2 _ N A M E _ E X T R A C T >------------------------------------------

{{#invoke:Language/data/ISO 639-2/make|ISO_2_name_extract|file-date=2013-01-11}}

reads a local copy of data from the table at http://www.loc.gov/standards/iso639-2/php/English_list.php, extracts
the ISO 639-2 and 639-2T codes and their associated language names

useful lines in the source table have the form:
	<English name>\t<all English names>\t<all French names>\t<639-2 code>\t<639-1 code>\n
where:
	<English name> is primary English name (not used here); one of <all English names> so duplicates code listing
	<all English names> is all of the English names (used here)
	<all French names> is all of the French names (not used here)
	<639-2 code> is the three-character ISO 639-2 or 639-2B/639-2T language code; both are used
	<639-1 code> is the two-character ISO 639-1 language code synonym of the -2 code (not used here)

for the file date use the date listed at the bottom of the source page, written as YYYY-MM-DD (as in the example above); the value is copied verbatim into the File-Date comment of the output

]]

local function ISO_2_name_extract (frame)
	-- Builds the text of a Lua table that maps every ISO 639-2 code (both the -2B and the -2T code when a
	-- pair is listed) to its English language names, from the tab-separated rows found in the unparsed
	-- content of the current page.
	--
	-- frame: the {{#invoke:}} frame; frame.args["file-date"] is required and is copied verbatim into the
	--        'File-Date:' comment at the top of the output
	-- returns: a string of the form '<br /><pre>-- File-Date: ...<br />return {...}<br /></pre>' with one
	--          '["<code>"] = {"<name>", ...}' entry per code, sorted
	-- raises: an error when |file-date= is missing

	local function trim (s)														-- strip leading and trailing whitespace from a field
		return (s:match ('^%s*(.-)%s*$'));										-- parentheses keep only the first return value
	end

	local date_arg = frame.args["file-date"];
	if not date_arg then														-- fail with a readable message instead of a nil-concatenation error
		error ('|file-date= is required (use the date listed at the bottom of the source page)', 0);
	end
	local file_date = 'File-Date: ' .. date_arg;								-- set the file date line from |file-date=

	local page = mw.title.getCurrentTitle();									-- get a page object for this page
	local content = page:getContent() or '';									-- get unparsed content; empty when the page has no content
	local skip_table = {['qaa-qtz']=true};										-- table of 639-2/639-2T codes that have been handled; used to prevent duplication; qaa-qtz reserved for local use so not supported here
	local out_table = {};														-- output table

	for _, line in ipairs (mw.text.split (content, '[\r\n]')) do				-- for each line of the source
		local split_table = mw.text.split (line, '\t');							-- split at the tabs

		if split_table[4] then													-- if a code field then continue processing; skip this line else
			local name_table = {};												-- quoted language names for this line
			for _, name in ipairs (mw.text.split (trim (split_table[2]), ' *; *')) do	-- split 'all English names' at the '; '
				if '' ~= name then												-- a stray or trailing ';' yields an empty name; drop it
					local escaped = name:gsub ('[\\"]', '\\%0');				-- keep the generated Lua valid if a name holds a quote or backslash
					table.insert (name_table, '"' .. escaped .. '"');			-- add double quotes around each name
				end
			end
			local names = table.concat (name_table, ', ');

			-- split 'ISO 639-2' code at the '/' into -2B and -2T individual codes
			for _, code in ipairs (mw.text.split (trim (split_table[4]), ' */ *')) do
				if '' ~= code and not skip_table[code] then						-- ignore blank code fields; source data has duplicates so skip codes already done
					table.insert (out_table, '["' .. code .. '"] = {' .. names .. '}');	-- code index and its names table
					skip_table[code] = true;									-- remember that we've done this code
				end
			end
		end
	end

	table.sort (out_table);

	return "<br /><pre>-- " .. file_date .. "<br />return {<br />&#9;" .. table.concat (out_table, ',<br />&#9;') .. "<br />&#9;}<br />" .. "</pre>";
end


--[[--------------------------< E X P O R T E D   F U N C T I O N S >------------------------------------------
]]

local exports = {};																-- functions reachable through {{#invoke:}}
exports.ISO_synonym_extract = ISO_synonym_extract;								-- builds the 639-2 -> 639-1 synonym table text
exports.ISO_2_name_extract = ISO_2_name_extract;								-- builds the 639-2 code -> language names table text

return exports;