package org.eso.phase3.validator;

import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.eso.oca.fits.DataTransportFormatHandler;
import org.eso.oca.fits.FileHandlerException;
import org.eso.oca.fits.TypedHeaderCard;
import org.eso.oca.fits.TypedHeaderCardException;
import org.eso.phase3.catalog.domain.LinkType;
import org.eso.phase3.domain.Constants;
import org.eso.phase3.validator.catalog.TXLinkKwdValidator;

import uk.ac.starlink.fits.FitsTableBuilder;
import uk.ac.starlink.table.RowSequence;
import uk.ac.starlink.table.StarTable;
import uk.ac.starlink.table.StoragePolicy;
import uk.ac.starlink.util.FileDataSource;

/**
 * Parses the headers (actually, currently only the header of the primary extension)
 * of a fits file looking for the keywords defining a release's structure. The
 * keywords are defined in the fields matching the regular expressions of:
 * <ul>
 * <li><code>DATASET_COMPONENT_NAME_KW</code> (definitions of dataset component name).</li>
 * <li><code>DATASET_COMPONENT_CATG_KW</code> (definitions of dataset component category).</li>
 * <li><code>CATG_KW</code> (definition of the file's category, stored in the categoryMap).</li>
 * <li><code>PROVENANCE_KW</code> (definitions of a provenance component).</li>
 * </ul>
 * Note that it is mandatory for a dataset component name keyword to have a
 * matching dataset component category keyword. To improve performance, it is
 * assumed that the keywords start with one of NAME_KW_START, CATG_KW_START,
 * MD5SUM_KW_START or PROVENANCE_KW_START.
 * The optional keyword DATASET_COMPONENT_MD5SUM_KW has been added for the
 * declared md5sum values (it must be matched by a DATASET_COMPONENT_NAME_KW).
 * This class also checks the preconditions for its fits file, see {@link FailedPrecondition}.
 *
 * @author dsforna
 */
public class FitsReleaseStructureParserImp
{
    /** For compressed fits files this keyword marks the header to parse. */
    public static final String ZSIMPLE_KW = Consts.ZSIMPLE_KW;

    
    /** The value of this keyword is stored as the entry in the categoryMap for this file. */
    public static final String CATG_KW = Consts.CATG_KW;

    /**
     * Regular expression disallowing leading zeros for the indexes. Note that a
     * keyword like ASSON01 does not lead to an error: it is simply ignored
     * because it does not match the keyword pattern.
     */
    public static final String NUMBER_PATTERN_NO_LEADING0 = "([1-9]\\d*)";

    /**
     * Regular expression allowing leading zeros for the indexes. Note that a
     * keyword like ASSON0 is valid with this pattern.
     */
    public static final String NUMBER_PATTERN_WITH_LEADING0 = "(\\d+)";

    /** Leading string of the dataset component name keywords. */
    public static final String NAME_KW_START = "ASSON";

    /** Leading string of the dataset component category keywords. */
    public static final String CATG_KW_START = "ASSOC";

    /** Leading string of the md5sum component keywords. */
    public static final String MD5SUM_KW_START = "ASSOM";
    
    /** Leading string of the provenance component keywords. */
    public static final String PROVENANCE_KW_START = "PROV";

    /**
     * Regular expression matching any of the dataset component md5sum keywords.
     * Rebuilt on every call of {@link #acceptLeading0InIndexes(boolean)}.
     */
    private static String DATASET_COMPONENT_MD5SUM_KW;

    /**
     * Regular expression matching any of the dataset component category keywords.
     * Rebuilt on every call of {@link #acceptLeading0InIndexes(boolean)}.
     */
    private static String DATASET_COMPONENT_CATG_KW; 

    /**
     * Regular expression matching any of the dataset component name keywords.
     * Rebuilt on every call of {@link #acceptLeading0InIndexes(boolean)}.
     */
    private static String DATASET_COMPONENT_NAME_KW;

    /**
     * Regular expression matching any of the provenance component keywords.
     * Rebuilt on every call of {@link #acceptLeading0InIndexes(boolean)}.
     */
    private static String PROVENANCE_KW;

    /**
     * Index pattern currently in use: either NUMBER_PATTERN_WITH_LEADING0 or
     * NUMBER_PATTERN_NO_LEADING0, as selected by
     * {@link #acceptLeading0InIndexes(boolean)}.
     */
    private static String NUMBER_PATTERN;
    
    /**
     * Keyword (defined in 2.4.2 of SDP standard) whose boolean value tells
     * whether the provenance has to be read from a fits extension instead of
     * from the PROV keywords.
     */
    private static final String EXTPROVENANCE_KW = "PROVXTN";
    
    /**
     * Compile the patterns only once and store them in this Map. The key of an
     * entry is the regular expression string, the value its compiled pattern.
     */
    private static Map<String, Pattern> kwPattern;

    /**
     * Keywords of the currently implemented preconditions to check.
     * Note the format of an entry: (string, enum-type) where string is the 
     * keyword as parsed in the fits file, enum is its corresponding enum value.
     */
    private static Map<String, FailedPrecondition.KEYWORD> preconditionKeys;
    
    static
    {
        // Leading zeros in the keyword indexes are rejected unless explicitly enabled.
        acceptLeading0InIndexes(false);

        // Index every precondition keyword by its textual form, i.e. by the
        // string that is compared against the keys found in the fits headers.
        preconditionKeys = new HashMap<String, FailedPrecondition.KEYWORD>();
        final FailedPrecondition.KEYWORD[] allKeywords = FailedPrecondition.KEYWORD.values();
        for (int i = 0; i < allKeywords.length; i++)
        {
            preconditionKeys.put(allKeywords[i].toString(), allKeywords[i]);
        }
    }

    /** Apache Log4J logger for this class namespace. */
    private static final Logger logger = Logger.getLogger(FitsReleaseStructureParserImp.class);

    /**
     * Select the pattern used for the numeric index of the parsed keywords and
     * recompile every keyword pattern that depends on it. All the parsed
     * keywords except <code>CATG_KW</code> carry such an index.
     * <ul>
     * <li>If the input boolean is true, <code>NUMBER_PATTERN_WITH_LEADING0</code>
     * is used: leading 0 digits are allowed in the index.</li>
     * <li>If the input boolean is false, <code>NUMBER_PATTERN_NO_LEADING0</code>
     * is used: leading 0 digits are not allowed in the index.</li>
     * </ul>
     * The static pattern strings and the map of compiled patterns are replaced
     * by this call.
     *
     * @param accept true to allow leading zeros in the keyword indexes.
     */
    public static void acceptLeading0InIndexes(boolean accept)
    {
        NUMBER_PATTERN = accept ? NUMBER_PATTERN_WITH_LEADING0 : NUMBER_PATTERN_NO_LEADING0;

        // The index is always wrapped in its own capturing group.
        final String indexGroup = "(" + NUMBER_PATTERN + ")";

        kwPattern = new HashMap<String, Pattern>();
        kwPattern.put(CATG_KW, Pattern.compile(CATG_KW));

        DATASET_COMPONENT_NAME_KW = "(" + NAME_KW_START + ")" + indexGroup;
        kwPattern.put(DATASET_COMPONENT_NAME_KW, Pattern.compile(DATASET_COMPONENT_NAME_KW));

        DATASET_COMPONENT_CATG_KW = "(" + CATG_KW_START + ")" + indexGroup;
        kwPattern.put(DATASET_COMPONENT_CATG_KW, Pattern.compile(DATASET_COMPONENT_CATG_KW));

        DATASET_COMPONENT_MD5SUM_KW = "(" + MD5SUM_KW_START + ")" + indexGroup;
        kwPattern.put(DATASET_COMPONENT_MD5SUM_KW, Pattern.compile(DATASET_COMPONENT_MD5SUM_KW));

        PROVENANCE_KW = PROVENANCE_KW_START + indexGroup;
        kwPattern.put(PROVENANCE_KW, Pattern.compile(PROVENANCE_KW));
    }

    /** The files/categories associations defined in this fits: entries are (filename, category). */
    private final Map<String, String> categoryMap;

    /** The dataset (set of component file names) defined in this fits. */
    private final Set<String> dataset;

    /** This fits file name (without path). */
    private final String filename;
    
    /** This fits file absolute path and name. */
    private final String filePathName;

    /** The provenance components defined in this fits. */
    private final Set<String> provenanceComponents;

    /** From which header to extract the metadata (index = 0 for primary header). */
    private int parsedHeaderIndex; 

    /**
     * Preconditions that failed for this fits file: one entry for each
     * precondition keyword missing in a header. Empty list only if every
     * header of this fits file contains all the precondition keywords.
     */
    private  List<FailedPrecondition> failedPreconditions;
    
    /** The files/md5sum values associations defined in this fits. */
    private final Map<String, String> md5sumMap;

    /** Number of HDUs of this fits file, as read while parsing. */
    private int hduNum;
    
    /** True if the PROVXTN keyword was found with a boolean value of true. */
    private boolean hasExtensionProvenance = false;
    
    /* max index of PROV keywords: used to check if the sequence is complete */
    private int maxProvIndex = 0;
    
    /**
     * @return the number of HDUs of this fits file as read by {@link #parse()};
     * the value is 0 as long as the file has not been parsed.
     */
    public int getHduNum()
    {
        return this.hduNum;
    }

    /**
     * Create a file parser for the input fits file. Only the name and the
     * absolute path of the file are stored here, and the result containers are
     * created empty: they are filled by {@link #parse()}.
     * @param fitsfile the fits file to parse.
     */
    public FitsReleaseStructureParserImp(final File fitsfile) 
    {
        logger.trace("");
        this.filename = fitsfile.getName();
        this.filePathName = fitsfile.getAbsolutePath();

        // Containers for the parsed release structure, initially empty.
        this.dataset = new HashSet<String>();
        this.provenanceComponents = new HashSet<String>();
        this.categoryMap = new HashMap<String, String>();
        this.md5sumMap = new HashMap<String, String>();
        this.failedPreconditions = new ArrayList<FailedPrecondition>();
    }

    /**
     * Describe this parser: the file name, followed by the parsed dataset, the
     * parsed provenance and the parsed category map as comma separated
     * <code>file=category</code> pairs.
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString()
    {
        final StringBuilder text = new StringBuilder("File ");
        text.append(filename);
        text.append(" dataset = ").append(Arrays.toString(dataset.toArray()));
        text.append(" provenance = ").append(Arrays.toString(provenanceComponents.toArray()));
        text.append(" categoryMap = ");

        // No separator before the first pair, a comma before each following one.
        String separator = "";
        for (final Entry<String, String> pair : categoryMap.entrySet())
        {
            text.append(separator).append(pair.getKey()).append("=").append(pair.getValue());
            separator = ",";
        }
        return text.toString();
    }

    /**
     * @return the md5sum values declared in this fits header, as a map with
     * entries (filename, md5sum). The internal map itself is returned, not a copy.
     */
    public Map<String, String> getMd5sumMap()
    {
        return this.md5sumMap;
    }
    
    /**
     * @return the preconditions which failed while iterating through the
     * headers of this file. The list is empty if no precondition failed.
     */
    public  List<FailedPrecondition> getFailedPreconditions() 
    {
        return this.failedPreconditions;
    }
    
    /**
     * @return the categories defined in this fits header, as a map with
     * entries (filename, category). The map can be empty but it is never null.
     */
    public Map<String, String> getCategoryMap()
    {
        return this.categoryMap;
    }

    /**
     * @return the dataset parsed from this header, as a set of file names.
     * The set can be empty but it is never null.
     */
    public Set<String> getDataset()
    {
        return this.dataset;
    }

    /**
     * @return the name (without path) of the fits file of this parser.
     */
    public String getFilename()
    {
        return this.filename;
    }

    /**
     * @return the provenance components parsed for this file.
     * The set can be empty but it is never null.
     */
    public Set<String> getProvenance()
    {
        return this.provenanceComponents;
    }
    
    /**
     * Check the preconditions for validity of each header of this fits file:
     * every header must contain all the keywords stored in
     * <code>preconditionKeys</code>. Each keyword missing in a header is logged
     * and recorded as a {@link FailedPrecondition} in the list returned by
     * {@link #getFailedPreconditions()}.
     *
     * @param fh fits file handler.
     * @param parseHeaderIndex index of the header whose cards must be returned.
     * @return the cards of the header with index <code>parseHeaderIndex</code>,
     *         or null if this file has no header with that index.
     * @throws FileHandlerException
     * @throws TypedHeaderCardException
     */
    private TypedHeaderCard[] checkPreconditions(DataTransportFormatHandler fh, int parseHeaderIndex) 
            throws FileHandlerException, TypedHeaderCardException
    {
        final int requiredCount = preconditionKeys.size();
        TypedHeaderCard[] returnCards = null;
        for (int headerIndex = 0; headerIndex < hduNum; headerIndex++)
        {
            final TypedHeaderCard[] cards = fh.getFITSCards(headerIndex);
            if (headerIndex == parseHeaderIndex)
            {
                // The cards are read anyway: hand them back to spare a second read.
                returnCards = cards;
            }

            final Set<String> foundKeys = new HashSet<String>();
            for (final TypedHeaderCard card : cards)
            {
                final String key = card.getKey();
                if (!preconditionKeys.containsKey(key))
                {
                    continue;
                }

                logger.debug("OK Found keyword "+ key + " in header index="
                        + headerIndex + " of file " + filename);
                if (!foundKeys.add(key))
                {
                    logger.warn("Found multiple instances of keyword "+ key 
                            + " in header index=" + headerIndex + " of file " + filename);
                }

                if (foundKeys.size() == requiredCount)
                {
                    // All the precondition keywords were found in this header.
                    break;
                }
            }

            for (final Entry<String, FailedPrecondition.KEYWORD> required : preconditionKeys.entrySet())
            {
                if (!foundKeys.contains(required.getKey()))
                {
                    logger.warn("Missing keyword "+ required.getKey() + " in header index=" + headerIndex + " of file " + filename);
                    failedPreconditions.add(new FailedPrecondition(headerIndex, required.getValue()));
                }
            }
        }
        return returnCards;
    }

    /**
     * Extract the known keywords from the header of this FITS file. Index is 0 
     * for non compressed fits, and either 0 or 1 for compressed fits.
     * The preconditions of all the headers are checked as well, see
     * {@link #getFailedPreconditions()}. The file handler allocated for the
     * parsing is always disposed before returning, also in case of errors.
     *
     * @return the index of the parsed header of this file.
     * @throws ParseException if the file cannot be read, if the header to parse
     *         does not exist, if a keyword value is not valid, if both PROVXTN
     *         and PROV keywords are used, if the sequence of PROV keywords is
     *         incomplete or if the parsed keywords do not match each other.
     */
    public int parse() throws ParseException
    {
        logger.trace("");
        final Map<String, String> datasetKwNameVal = new HashMap<String, String>();
        final Map<String, String> datasetKwCatgVal = new HashMap<String, String>();
        final Map<String, String> datasetKwmd5sumVal = new HashMap<String, String>();

        DataTransportFormatHandler fileHandlerOnlyForParse = null;
        try
        {
            TypedHeaderCard[] cards;
            try
            {
                fileHandlerOnlyForParse = ValidationUtil.allocateDTFH(filePathName);
                hduNum = fileHandlerOnlyForParse.getNumberOfHDUs();
                logger.debug(filename + " has " + hduNum + " HDU(s).");
                parsedHeaderIndex = ValidationUtil.indexHeaderToParse(fileHandlerOnlyForParse, filename);
                logger.debug("Extracting keywords from header index = " + parsedHeaderIndex + " (index range is 0.. "+(hduNum-1)+")");
                cards = checkPreconditions(fileHandlerOnlyForParse, parsedHeaderIndex);
            }
            catch( final FileHandlerException e )
            {
                logger.error(filename + ": " + e.toString());
                throw parseFailure(e.getMessage(), e);
            }
            catch( final TypedHeaderCardException e )
            {
                logger.error(filename + ": " + e.toString());
                throw parseFailure(e.getMessage(), e);
            }

            if (cards == null)
            {
                // No header has the requested index (e.g. a file without HDUs).
                final String msg = "Header index=" + parsedHeaderIndex
                        + " not found (index range is 0.." + (hduNum-1) + ")";
                logger.error(filename + ": " + msg);
                throw new ParseException(msg, 0);
            }

            String key = null;
            int cardCounter = 0; // only for logging and exception messages.
            try
            {
                for (final TypedHeaderCard card : cards)
                {
                    key = card.getKey(); // for logging.
                    cardCounter++;
                    extractFromCard(card, datasetKwNameVal, datasetKwCatgVal, datasetKwmd5sumVal);
                }

                if (hasExtensionProvenance)
                {
                    if (provenanceComponents.size() > 0)
                    {
                        final String msg = "PROVXTN and PROV keywords are not allowed at the same time";
                        logger.error(filename + ": " + msg);
                        throw new ParseException(msg, 0);
                    }
                    extractExtensionProvenance(filePathName);
                }
                else if (provenanceComponents.size() != maxProvIndex)
                {
                    // Duplicated values are rejected while parsing, therefore a
                    // size different from the max index means a gap in PROV1..PROVn.
                    final String msg = "incomplete sequence of PROV keywords";
                    logger.error(filename + ": " + msg);
                    throw new ParseException(msg, 0);
                }
            }
            catch (ParseException|IOException e) 
            {
                // This happened while parsing the values of the keywords.
                final String msg = e.getMessage() + " Header index=" 
                        + parsedHeaderIndex + " (index range is 0.."+(hduNum-1)
                        +"), keyword nr.=" + cardCounter + " key="+ key;
                throw parseFailure(msg, e);
            }
        }
        finally
        {
            // Release the handler on every path, including early failures.
            if (fileHandlerOnlyForParse != null)
            {
                fileHandlerOnlyForParse.dispose();
            }
        }

        matchKeywords(datasetKwNameVal, datasetKwCatgVal, datasetKwmd5sumVal);
        logger.debug("Parsing completed for " + filename);
        return parsedHeaderIndex;
    }

    /** Build a ParseException (error offset 0) which keeps track of its cause. */
    private static ParseException parseFailure(final String message, final Throwable cause)
    {
        final ParseException failure = new ParseException(message, 0);
        failure.initCause(cause);
        return failure;
    }
    
    /**
     * Read the provenance components from the table extension named
     * PHASE3PROVENANCE: each cell of its column PROV is added to the
     * provenance of this file. Extensions which cannot be read as a table are
     * logged and skipped.
     *
     * @param filename path and name of the fits file to read.
     * @throws IOException if no extension named PHASE3PROVENANCE is found, if
     *         it has no column named PROV or if the rows cannot be read.
     * @throws ParseException if a provenance value is null, empty or duplicated.
     */
    private void extractExtensionProvenance(String filename) throws IOException, ParseException {
        final FitsTableBuilder builder = new FitsTableBuilder();
        final File fitsFile = new File(filename);

        // Index 0 is the primary header: table extensions start at index 1.
        StarTable table = null;
        for (int idx = 1; idx < hduNum && table == null; idx++) {
            try {
                final StarTable candidate = builder.makeStarTable(
                        new FileDataSource(fitsFile, Integer.toString(idx)),
                        false, StoragePolicy.getDefaultPolicy());
                // Constant first: a table without name must not raise an NPE.
                if (candidate != null && "PHASE3PROVENANCE".equals(candidate.getName())) {
                    table = candidate;
                }
            } catch (IOException e) {
                logger.info(e.getMessage());
            }
        }
        if (table == null) {
            throw new IOException("No extension with EXTNAME=PHASE3PROVENANCE was found");
        }

        int provColIndex = -1;
        for (int idx = 0; idx < table.getColumnCount(); idx++) {
            if ("PROV".equals(table.getColumnInfo(idx).getName())) {
                provColIndex = idx;
                break;
            }
        }
        if (provColIndex < 0) {
            final String firstColumn = table.getColumnCount() > 0
                    ? table.getColumnInfo(0).getName() : "<no columns>";
            throw new IOException("Provenance column name must be PROV, not " + firstColumn);
        }

        final RowSequence rseq = table.getRowSequence();
        try {
            while (rseq.next()) {
                final Object cell = rseq.getCell(provColIndex);
                // A null cell is reported by addProvenanceComponent as a ParseException.
                addProvenanceComponent(cell == null ? null : cell.toString());
            }
        } finally {
            rseq.close();
        }
    }
    
    /**
     * Read the provenance components from the table extension named
     * PHASE3CATALOG. The columns to read are the ones for which the header of
     * that extension has a keyword starting with
     * <code>TXLinkKwdValidator.kwIdentifier</code>, followed by the 1-based
     * column number, and with a value accepted by <code>LinkType.valueOf</code>.
     * Extensions which cannot be read as a table are logged and skipped.
     *
     * @param filename path and name of the fits file to read.
     * @return the distinct values found in the provenance columns.
     * @throws IOException if no extension named PHASE3CATALOG is found, if it
     *         has no provenance column or if the rows cannot be read.
     * @throws ParseException declared for the callers, not thrown by this code.
     * @throws FileHandlerException if the fits file handler cannot be used.
     * @throws TypedHeaderCardException if the header cards cannot be read.
     */
    private Set<String> extractCatalogProvenance(String filename) throws IOException, ParseException, FileHandlerException, TypedHeaderCardException {
        final FitsTableBuilder builder = new FitsTableBuilder();
        final File fitsFile = new File(filename);

        // Index 0 is the primary header: table extensions start at index 1.
        StarTable table = null;
        int idx;
        for (idx = 1; idx < hduNum; idx++) {
            try {
                final StarTable candidate = builder.makeStarTable(
                        new FileDataSource(fitsFile, Integer.toString(idx)),
                        false, StoragePolicy.getDefaultPolicy());
                // Constant first: a table without name must not raise an NPE.
                if (candidate != null && "PHASE3CATALOG".equals(candidate.getName())) {
                    table = candidate;
                    break;
                }
            } catch (IOException e) {
                logger.info(e.getMessage());
            }
        }

        if (table == null) {
            throw new IOException("No extension with EXTNAME=PHASE3CATALOG was found");
        }

        final List<Integer> provColIndices = new ArrayList<Integer>();
        final DataTransportFormatHandler dtfh = ValidationUtil.allocateDTFH(filePathName);
        try {
            for (final TypedHeaderCard card : dtfh.getFITSCards(idx)) {
                final String cardKey = card.getKey();
                if (!cardKey.startsWith(TXLinkKwdValidator.kwIdentifier)) {
                    continue;
                }
                try {
                    final String columnNumber = cardKey.substring(TXLinkKwdValidator.kwIdentifier.length());
                    final int colIndex = Integer.parseInt(columnNumber) - 1;
                    LinkType.valueOf(card.getValue());
                    provColIndices.add(colIndex);
                } catch (Exception ignored) {
                    // Deliberately best effort: a keyword without a numeric
                    // suffix or with an unknown link type is not a provenance column.
                    logger.trace("Skipping " + cardKey + ": not a provenance link keyword.");
                }
            }
        } finally {
            // The handler is needed only to read the cards of the catalog extension.
            dtfh.dispose();
        }

        if (provColIndices.size() == 0) {
            throw new IOException("Found no column with TXLNK ORIGFILE or ARCFILE");
        }

        final Set<String> catalogProvenance = new HashSet<String>();
        for (final int provColIndex : provColIndices) {
            logger.debug("Extracting provenance from column " + provColIndex);
            final RowSequence rseq = table.getRowSequence();
            try {
                while (rseq.next()) {
                    final String prov = rseq.getCell(provColIndex).toString();
                    catalogProvenance.add(prov);
                }
            } finally {
                rseq.close();
            }
        }
        return catalogProvenance;
    }
//
//    private void extractCatalogProvenance(String filename) throws IOException, ParseException, FileHandlerException, TypedHeaderCardException {
//    	FitsTableBuilder ftb = new FitsTableBuilder();
//    	StarTable table = null;
//    	int idx;
//    	for (idx = 1; idx < hduNum; idx++) {
//    		try {
//    			table = ftb.makeStarTable(new FileDataSource(new File(filename), new Integer(idx).toString()), false, StoragePolicy.getDefaultPolicy());
//        		if (table.getName().equals("PHASE3CATALOG"))
//        			break;
//    		} catch (IOException e) {
//    			logger.info(e.getMessage());
//    		}
//    	}
//    	
//    	if (table == null)
//    		throw new IOException("No extension with EXTNAME=PHASE3CATALOG was found");
//    	
//    	List<Integer> provColIndices = new ArrayList<Integer>();
//    	DataTransportFormatHandler dtfh = ValidationUtil.allocateDTFH(filePathName);
//    	TypedHeaderCard[] cards = dtfh.getFITSCards(idx);
//    	for (TypedHeaderCard card: cards) {
//    		if (card.getKey().startsWith(TXLinkKwdValidator.kwIdentifier))
//    			try {
//    				int colIndex = Integer.valueOf(card.getKey().replaceFirst(TXLinkKwdValidator.kwIdentifier, "")) - 1;
//    				LinkType.valueOf(card.getValue());
//    				provColIndices.add(colIndex);
//    			} catch (Exception e) {
//    			}
//    	}
//    	
//    	if (provColIndices.size() == 0)
//   			throw new IOException("Found no column with TXLNK ORIGFILE or ARCFILE");
//
//    	for (int provColIndex: provColIndices) {
//    		logger.debug("Extracting provenance from column " + provColIndex);
//    		RowSequence rseq = table.getRowSequence();
//    		while (rseq.next()) {
//    			String prov = rseq.getCell(provColIndex).toString();
//    			addProvenanceComponent(prov);
//    		}
//    		rseq.close();
//    	}
//	}

	/**
     * If the input card contains a known release structure keyword, store its value.
     * The category of this file and the provenance components are stored
     * directly in this object; the indexed dataset keywords are only collected
     * in the input maps, to be matched later against each other.
     *
     * @param card the header card to inspect.
     * @param datasetKwNameVal values of dataset component name keywords are stored here. 
     * @param datasetKwCatgVal values of dataset component category keywords are stored here.
     * @param datasetKwmd5sumVal values of dataset component md5sum keywords are stored here.
     * @throws ParseException if the category of this file is not a science
     *         category, if a dataset component has a science category or if a
     *         category or provenance value is null, empty or duplicated.
     */
    private void extractFromCard(TypedHeaderCard card, 
            final Map<String, String> datasetKwNameVal,
            final Map<String, String> datasetKwCatgVal,
            final Map<String, String> datasetKwmd5sumVal) throws ParseException
    {
        String key = card.getKey();
        final String val = card.getValue();
        logger.trace("Extracted: " + key + "=" + val);
      
        // NOTE: startsWith() in the block below is used only to improve performance.
        if (key.equals(CATG_KW))
        {
            if (Util.isScience(val)) {
                addCatg(filename, val);
            } else {
                String msg = "Keyword " + CATG_KW  
                        + " is allowed only in science files, i.e. the value must start with: "
                        + Constants.SCIENCE_CATG_START 
                        + " Parsed value will be ignored [" + val + "]";
                throw new ParseException(msg, 0);
            }
        }
        else if (key.startsWith(NAME_KW_START) && (kwPattern.get(DATASET_COMPONENT_NAME_KW).matcher(key)).matches())
        {
            logger.debug("Found a dataset component key=val: " + key + "=" + val);
            datasetKwNameVal.put(key, val);
        }
        else if (key.startsWith(CATG_KW_START) && (kwPattern.get(DATASET_COMPONENT_CATG_KW).matcher(key)).matches())
        {
            logger.debug("Found a dataset component category key=val: " + key + "=" + val);
            /* DFS11628 */
            if (Util.isScience(val))
            {
                String msg = "Category starting with " 
                        + Constants.SCIENCE_CATG_START 
                        + " is allowed only in keyword " + CATG_KW 
                        + " (a science file cannot be defined as dataset component) - Parsed value will be ignored [" 
                        + val + "]"; 
                throw new ParseException(msg, 0);
            }
            else 
            {
                datasetKwCatgVal.put(key, val);
            }
        }
        else if (key.startsWith(MD5SUM_KW_START) && (kwPattern.get(DATASET_COMPONENT_MD5SUM_KW).matcher(key)).matches())
        {
            logger.debug("Found a dataset component md5sum key=val: "
                    + key + "=" + val);
            datasetKwmd5sumVal.put(key, val);
        }
        else if (key.equals(EXTPROVENANCE_KW))
        {
            // Must be tested before the PROV keywords: PROVXTN starts with PROV too.
            logger.debug("Found a provenance component name: " + key + "=" + val);
            final Object obj = card.getValueObject();
            if (obj instanceof Boolean)
            {
                hasExtensionProvenance = (Boolean) obj;
            }
            else
            {
                // A card without value object must not raise an NPE here.
                final String actualType = (obj == null) ? "null" : obj.getClass().toString();
                logger.error("Keyword " + key + " must be a boolean, not a " + actualType);
            }
        }        
        else if (key.startsWith(PROVENANCE_KW_START))
        {
            final Matcher m = kwPattern.get(PROVENANCE_KW).matcher(key);
            if (m.matches()) {
                logger.debug("Found a provenance component name: " + key + "=" + val);
                // Track the highest index to detect gaps in the PROV sequence.
                final int index = Integer.parseInt(m.group(1));
                if (index > maxProvIndex)
                {
                    maxProvIndex = index;
                }
                addProvenanceComponent(val);
            }
        }
        else
        {
            logger.trace( key + " is not a keyword for the release structure. Skipping.");
        }
    }


    /**
     * Build the dataset for this file from the parsed dataset component
     * keywords. Each md5sum keyword and each category keyword must have a
     * sibling name keyword, and each name keyword must have a sibling
     * category keyword. The matched values are stored in the md5sum map, in
     * the category map and in the dataset of this object. This file itself is
     * added to the dataset if the dataset is not empty, or if this file has a
     * science category.
     *
     * @param datasetKwNameVal the parsed names
     * @param datasetKwCatgVal the parsed categories
     * @param datasetKwmd5sumVal the parsed md5sum values
     * @throws ParseException if a keyword has no sibling keyword, or if a
     *         dataset component or a category is defined more than once.
     */
    private void matchKeywords(
            final Map<String, String> datasetKwNameVal,
            final Map<String, String> datasetKwCatgVal,
            final Map<String, String> datasetKwmd5sumVal) throws ParseException
    {
        final Set<String> matchedNameKeywords = new HashSet<String>();
        if (datasetKwCatgVal.size() > 0)
        {
            logger.debug("Adding the parsed dataset name/category components ("
                    + datasetKwCatgVal.size()
                    + ") to the global map.");
        }

        // Match the md5sum/component name keywords:
        for (final Entry<String, String> md5sumEntry : datasetKwmd5sumVal.entrySet())
        {
            final String md5sumKeyword = md5sumEntry.getKey();
            final String md5sum = md5sumEntry.getValue();
            final String componentNameKeyword = siblingNameKeyword(md5sumKeyword);
            final String filenameForMd5sum = datasetKwNameVal.get(componentNameKeyword);

            // The md5sum must be associated with a filename: without this
            // check the value would be stored under a null key.
            if (filenameForMd5sum == null)
            {
                final String msg = "Extension id=" + parsedHeaderIndex
                        + " . Parsed md5sum from:" + md5sumKeyword
                        + "=" + md5sum
                        + " But the matching name keyword ["
                        + componentNameKeyword + "] was not found.";
                logger.error(msg);
                throw new ParseException(msg, 0);
            }
            md5sumMap.put(filenameForMd5sum, md5sum);
        }
        
        // Match the category/component name keywords:
        for (final Entry<String, String> catgEntry : datasetKwCatgVal.entrySet())
        {
            final String categoryKeyword = catgEntry.getKey();
            final String category = catgEntry.getValue();
            final String componentNameKeyword = siblingKeyword(categoryKeyword);
            
            // The category must be associated with a filename:
            if (!datasetKwNameVal.containsKey(componentNameKeyword))
            {
                final String msg = "Extension id=" + parsedHeaderIndex
                        + " . Parsed category from:" + categoryKeyword
                        + "=" + category
                        + " But the matching name keyword ["
                        + componentNameKeyword + "] was not found.";
                logger.error(msg);
                throw new ParseException(msg, 0);
            }

            matchedNameKeywords.add(componentNameKeyword);
            final String filenameForCategory = datasetKwNameVal.get(componentNameKeyword);
            logger.debug("Adding to the category map: "
                    + filenameForCategory + ", " + category);
            addCatg(filenameForCategory, category);
            logger.debug("Adding to the dataset of this file: "
                    + filenameForCategory);
            if (!dataset.add(filenameForCategory))
            {
                final String msg = "Extension id=" + parsedHeaderIndex
                        + " . Duplicated dataset component from:"
                        + componentNameKeyword + "="
                        + filenameForCategory;
                throw new ParseException(msg, 0);
            }
        }

        // Check that no dataset component name is without a category:
        for (final String componentNameKeyword : datasetKwNameVal.keySet())
        {
            if (!matchedNameKeywords.contains(componentNameKeyword))
            {
                final String msg = "Extension id=" + parsedHeaderIndex
                        + " . Parsed dataset component name keyword:"
                        + componentNameKeyword
                        + " But the matching category keyword ["
                        + siblingKeyword(componentNameKeyword)
                        + "] was not found.";
                logger.error(msg);
                throw new ParseException(msg, 0);
            }
        }

        // DFS09690 SCIENCEDATASET
        if (dataset.isEmpty())
        {
            if (categoryMap.containsKey(filename)
                    && Util.isScience(categoryMap.get(filename)))
            {
                logger.debug("Create a dataset just for this science file.");
                dataset.add(filename);
            }
        }
        else 
        {
            // Explicitly add this filename as component of the dataset:
            dataset.add(filename);
        }
    }

    /**
     * Store in the category map the category of a file. Both input strings
     * are trimmed before being checked and stored.
     * 
     * @param key
     *            the file for which the category is defined
     * @param val
     *            the category to assign to the file
     * @throws ParseException
     *             if the filename or the category is null or empty, or if a
     *             category is already defined for the file.
     */
    private void addCatg(String key, String val) throws ParseException
    {
        logger.trace("");
        if (key == null)
        {
            final String nullFile = "Null input filename of category.";
            logger.error(nullFile);
            throw new ParseException(nullFile, 0);
        }
        if (val == null)
        {
            final String nullCategory = "Null input category for file " + key;
            logger.error(nullCategory);
            throw new ParseException(nullCategory, 0);
        }

        final String file = key.trim();
        final String category = val.trim();
        if (file.isEmpty())
        {
            final String emptyFile = "Empty filename";
            logger.error(emptyFile);
            throw new ParseException(emptyFile, 0);
        }
        if (category.isEmpty())
        {
            final String emptyCategory = "Empty category for file " + file;
            logger.error(emptyCategory);
            throw new ParseException(emptyCategory, 0);
        }

        logger.debug("Found for file " + file + " category: " + category);
        if (!categoryMap.containsKey(file))
        {
            categoryMap.put(file, category);
            return;
        }

        // A file can have one category only.
        final String duplicated = "Multiple category definition for file " + file + " [previous: "
                + categoryMap.get(file) + " new: " + category + "]";
        logger.error(filename + ": " + duplicated);
        throw new ParseException(duplicated, 0);
    }

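    /**
     * Parse the value associated to the provenance keyword and add it to the
     * provenance components of this file.
     * 
     * @param val
     *            the value to parse, which represents a new component of the
     *            provenance.
     * @throws ParseException
     *             if the value is null, empty (after trimming), or already
     *             present among the provenance components.
     */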
    private void addProvenanceComponent(final String val) throws ParseException
    {
        logger.trace("");

        // Work out, in order, the first reason (if any) to reject the value:
        // null, empty once trimmed, or already registered.
        final String component = (val == null) ? null : val.trim();
        final String problem;
        if (component == null)
        {
            problem = "Null keyword value.";
        }
        else if (component.isEmpty())
        {
            problem = "Empty keyword value.";
        }
        else if (provenanceComponents.contains(component))
        {
            problem = "Duplicated provenance component found: [" + component
                    + "]";
        }
        else
        {
            problem = null;
        }

        // Every rejection is logged with the file name and reported the same way.
        if (problem != null)
        {
            logger.error(filename + ": " + problem);
            throw new ParseException(problem, 0);
        }

        logger.debug("Adding to the provenance definition of file " + filename
                + " the component " + component);
        provenanceComponents.add(component);
    }

    /**
     * Dataset name keywords (i.e. matching DATASET_COMPONENT_NAME_KW) and
     * dataset name category keywords (i.e. matching DATASET_COMPONENT_CATG_KW)
     * must go in pairs. This method takes in input either:
     * <ul>
     * <li>a keyword matching DATASET_COMPONENT_NAME_KW, and returns the
     * corresponding keyword for the category;</li>
     * <li>a keyword matching DATASET_COMPONENT_CATG_KW, and returns the
     * corresponding keyword for the name.</li>
     * </ul>
     * 
     * @param nameOrCatg
     *            the name or category keyword
     * @return the matching keyword.
     */
    /*
     * Returns the keyword paired with the input one: the category keyword for
     * a name keyword, and the name keyword for a category keyword.
     *
     * Throws IllegalArgumentException if the input is null, does not start
     * with CATG_KW_START or NAME_KW_START, or does not match the regular
     * expression registered in kwPattern for its kind.
     */
    private String siblingKeyword(final String nameOrCatg)
    {
        logger.trace("");
        if (nameOrCatg == null)
        {
            logger.error("Null input argument: nameOrCatg");
            throw new IllegalArgumentException(
                    "Null input argument: nameOrCatg");
        }

        // Select the pattern validating the input and the prefix of the
        // keyword to return. The category prefix is tested first, as before.
        final Matcher m;
        final String siblingKeywordStart;
        if (nameOrCatg.startsWith(CATG_KW_START))
        {
            logger.debug(nameOrCatg + " is a keyword for a category value.");
            m = kwPattern.get(DATASET_COMPONENT_CATG_KW).matcher(nameOrCatg);
            siblingKeywordStart = NAME_KW_START;
        }
        else if (nameOrCatg.startsWith(NAME_KW_START))
        {
            logger.debug(nameOrCatg + " is a keyword for a name value");
            m = kwPattern.get(DATASET_COMPONENT_NAME_KW).matcher(nameOrCatg);
            siblingKeywordStart = CATG_KW_START;
        }
        else
        {
            // The closing parenthesis was missing from this message.
            final String msg = "Invalid input keyword: " + nameOrCatg
                    + " (should start with either " + CATG_KW_START + " or "
                    + NAME_KW_START + " followed by a number).";
            logger.error(filename + ": " + msg);
            throw new IllegalArgumentException(msg);
        }

        if (!m.matches())
        {
            final String msg = nameOrCatg
                    + " does not match its regular expression.";
            logger.error(msg);
            throw new IllegalArgumentException(msg);
        }

        // NOTE(review): group 2 is taken as the numeric suffix of the keyword;
        // the patterns are defined elsewhere in this class - confirm there.
        final String numberString = m.group(2);
        final String sibling = siblingKeywordStart + numberString;
        logger.debug("Found the keyword " + nameOrCatg
                + " therefore the sibling keyword must be found as well: "
                + sibling);
        return sibling;
    }
    
    /**
     * Return the DATASET_COMPONENT_NAME_KW keyword matching the input
     * DATASET_COMPONENT_MD5SUM_KW keyword. This means, for instance: if the
     * input is the string "ASSOM3", the output will be "ASSON3". Similar to
     * {@link #siblingKeyword}, but here the input is always a
     * DATASET_COMPONENT_MD5SUM_KW.
     * 
     * @param md5sumKeyword
     *            the md5sum keyword
     * @return the matching name keyword.
     */
    private String siblingNameKeyword(final String md5sumKeyword)
    {
        logger.trace("");

        // Guard: the keyword must be present.
        if (md5sumKeyword == null)
        {
            logger.error("Null input argument: md5sumKeyword");
            throw new IllegalArgumentException(
                    "Null input argument: md5sumKeyword");
        }

        // Guard: only md5sum keywords are accepted here.
        if (!md5sumKeyword.startsWith(MD5SUM_KW_START))
        {
            final String badPrefixMsg = "Invalid input keyword: "
                    + md5sumKeyword + " (should start with " + MD5SUM_KW_START
                    + " followed by a number).";
            logger.error(filename + ": " + badPrefixMsg);
            throw new IllegalArgumentException(badPrefixMsg);
        }
        logger.debug(md5sumKeyword + " is a keyword for a md5sum value.");

        // Guard: the whole keyword must match the md5sum keyword pattern.
        final Matcher matcher = kwPattern.get(DATASET_COMPONENT_MD5SUM_KW)
                .matcher(md5sumKeyword);
        if (!matcher.matches())
        {
            final String noMatchMsg = md5sumKeyword
                    + " does not match its regular expression.";
            logger.error(noMatchMsg);
            throw new IllegalArgumentException(noMatchMsg);
        }

        // The name keyword is the name prefix followed by the second group
        // captured from the md5sum keyword.
        final String nameKeyword = NAME_KW_START + matcher.group(2);
        logger.debug("Found the keyword " + md5sumKeyword
                + " therefore the sibling keyword must be found as well: "
                + nameKeyword);
        return nameKeyword;
    }
}
