Source code for impacts_estimation.utils

""" Functions used by the environmental impact estimation program """

import copy
import numpy as np
from math import sqrt

from impacts_estimation.vars import NUTRIMENTS_CATEGORIES, TOP_LEVEL_NUTRIMENTS_CATEGORIES, \
    AGRIBALYSE_IMPACT_CATEGORIES_EN_TO_FR, AGRIBALYSE_IMPACT_CATEGORIES_FR
from data import ingredients_data, off_taxonomy


def nutriments_from_recipe(recipe):
    """
    Estimate the nutriment contents of a product from its recipe.

    Each nutriment is computed as the mass-weighted sum of the reference contents of the ingredients for which this
    nutriment is known, then scaled up to the total mass of the recipe.

    Args:
        recipe (dict): Dict containing ingredients as keys and masses in grams as values

    Warning:
        Ingredients whose content is unknown for a nutriment are implicitly given the average content of the
        ingredients for which it is known (this is the effect of the final scaling).

    Returns:
        dict: Dictionary with nutriments as keys and nutriment contents as values. A nutriment is absent from the
        result when the ingredients for which it is known have a total mass of zero.
    """
    # Converting every mass once, in recipe order
    masses = {name: float(mass) for name, mass in recipe.items()}
    total_mass = sum(masses.values())

    contents = dict()
    for nutriment in NUTRIMENTS_CATEGORIES:
        covered_mass = 0
        amount = 0

        for name, mass in masses.items():
            if name not in ingredients_data:
                continue

            reference = ingredients_data[name].get('nutriments', [])
            if nutriment not in reference:
                continue

            covered_mass += mass
            # Ingredients nutriment contents are given per 100g
            amount += mass * reference[nutriment]['value'] / 100

        # The nutriment is reported only if at least some mass is characterized for it
        if covered_mass != 0:
            # Extrapolating the content of the characterized mass to the total mass of the recipe
            contents[nutriment] = amount * total_mass / covered_mass

    return contents


def confidence_score(nutri, reference_nutri, total_mass, min_possible_mass, max_possible_mass, weighting_factor=10,
                     reference_mass=100):
    """
    Calculate the confidence score of a nutritional composition using the euclidean distance between the reference
    nutritional composition and the assessed nutritional composition in the space of all considered nutriments
    contents and the total mass of ingredients used. The closer the nutritional composition is to the reference, the
    higher the confidence score is. The closer the total mass of ingredients is to the reference mass, the higher
    the confidence score is.

    The score is defined as the inverse of the sum of the normalized nutritional distance (multiplied by the
    weighting factor) and the relative difference between the total mass and the reference mass.

    Args:
        nutri (dict): Nutritional composition to evaluate.
        reference_nutri (dict): Nutritional composition of the reference product. A "_100g" suffix in its keys is
            ignored.
        total_mass (float): Total mass of ingredients used in g.
        min_possible_mass (float): Minimum possible total ingredient mass for a product in g
        max_possible_mass (float): Maximum possible total ingredient mass for a product in g
        weighting_factor (float): Weight of the nutritional distance against the relative difference between
         the total mass and the reference mass.
        reference_mass (float): Mass for which the nutritional compositions are expressed (in g).

    Returns:
        float: Confidence score

    Raises:
        AssertionError: If the rounded total mass is not between the rounded minimum and maximum possible masses.
        ValueError: If the squared difference of a nutriment content is greater than 1.
        ZeroDivisionError: If no nutriment could be compared and the total mass is exactly the reference mass
            (both terms of the denominator are then zero).
    """
    assert round(min_possible_mass) <= round(total_mass) <= round(max_possible_mass)

    # Expressing all masses as fractions of the reference mass
    total_mass = total_mass / reference_mass
    min_possible_mass = min_possible_mass / reference_mass
    max_possible_mass = max_possible_mass / reference_mass

    # Removing "_100g" from reference_nutri keys
    reference_nutri = {k.replace('_100g', ''): v for k, v in reference_nutri.items()}

    # Calculating nutritional distance
    squared_differences = []
    for nutriment in nutri:
        # Only top level nutriments that are present in both compositions are compared
        if (nutriment not in TOP_LEVEL_NUTRIMENTS_CATEGORIES) or (nutriment not in reference_nutri):
            continue

        difference = (float(reference_nutri[nutriment]) / reference_mass) \
                     - (float(nutri[nutriment]) / reference_mass)

        squared_difference = round(difference ** 2, 6)

        # Setting a minimal squared difference to avoid extremely high confidence score values in case of very
        # similar nutritional compositions
        squared_difference = max(squared_difference, 0.0000001)

        if squared_difference > 1:
            raise ValueError("The squared difference cannot be superior to 1.")

        squared_differences.append(squared_difference)

    # The distance in the n-dimensional space is the square root of the sum of the squared differences
    nutri_distance = sqrt(sum(squared_differences))

    # Normalizing by sqrt(2)
    normalized_nutri_distance = nutri_distance / sqrt(2)

    # Calculating total mass likelihood coefficient: deviation from the reference mass relative to the distance
    # between the reference mass and the bound located on the same side
    mass_deviation = abs(total_mass - 1)
    if total_mass < 1:
        possible_deviation = 1 - min_possible_mass
    else:
        possible_deviation = max_possible_mass - 1

    if mass_deviation == 0:
        # The total mass is exactly the reference mass. Handled separately because the bound may also be equal to
        # the reference mass, which would otherwise give 0 / 0.
        mass_diff = 0.0
    elif possible_deviation <= 0:
        # The bound coincides with (or is on the wrong side of) the reference mass. This is only possible within
        # the rounding tolerance of the assertion above. The total mass is then considered to be at the bound.
        mass_diff = 1.0
    else:
        mass_diff = mass_deviation / possible_deviation

    return 1 / ((normalized_nutri_distance * weighting_factor) + mass_diff)


def natural_bounds(rank, nb_ingredients):
    """
    Give the lowest and highest possible proportions (in percent) of an ingredient, knowing only its rank and the
    number of ingredients of the product, ingredients being listed in decreasing proportion order.

    The first ingredient represents at least an equal share of the product and at most all of it. An ingredient of
    rank r may be absent and cannot exceed 100/r.

    Examples:
        >>> natural_bounds(2, 4)
        (0.0, 50.0)
        >>> natural_bounds(1, 5)
        (20.0, 100.0)

    Args:
        rank (int): Rank of the ingredient in the list
        nb_ingredients (int): Number of ingredients in the product

    Returns:
        tuple: Lower and upper bounds of the proportion of the ingredient
    """
    is_first = rank == 1

    lower_bound = 100 / nb_ingredients if is_first else .0
    upper_bound = 100.0 if is_first else 100 / rank

    return lower_bound, upper_bound


def nutritional_error_margin(nutriment, value):
    """
    Returns the error margin of a product's nutriment according to EU directives

    The margin depends on the nutriment and on the range in which its content falls. It is either an absolute
    margin or a relative one.

    Args:
        nutriment (str): Nutriment considered (case insensitive)
        value (float): Given product content of the considered nutriment, as a fraction between 0 and 1

    Returns:
        dict: Dictionary containing absolute and relative margins (only one of which is different from 0)

    Raises:
        AssertionError: If the value is not between 0 and 1.
        ValueError: If the nutriment is not recognized.

    Examples:
        >>> nutritional_error_margin('proteins', 0.05)
        {'absolute': 0.02, 'relative': 0}
        >>> nutritional_error_margin('proteins', 0.3)
        {'absolute': 0, 'relative': 0.2}
    """

    value = float(value)
    assert 0 <= value <= 1

    name = nutriment.lower()

    # For each nutriment: (exclusive upper limit of the range, type of margin, margin), in increasing order.
    # A limit of None denotes the last range, which extends up to 1 inclusive.
    if name in ('proteins', 'carbohydrates', 'sugars', 'fiber'):
        ranges = ((0.1, 'absolute', 0.02), (0.4, 'relative', 0.2), (None, 'absolute', 0.08))
    elif name == 'fat':
        ranges = ((0.1, 'absolute', 0.015), (0.4, 'relative', 0.2), (None, 'absolute', 0.08))
    elif name == 'saturated-fat':
        ranges = ((0.04, 'absolute', 0.008), (None, 'relative', 0.2))
    elif name == 'salt':
        ranges = ((0.0125, 'absolute', 0.00375), (None, 'relative', 0.2))
    else:
        raise ValueError('The nutriment is not recognized.')

    for upper_limit, margin_type, margin in ranges:
        if (upper_limit is None) or (value < upper_limit):
            margins = {'absolute': 0, 'relative': 0}
            margins[margin_type] = margin
            return margins


def clear_ingredient_graph(product):
    """
    Recursively prune the ingredients graph in place: the subingredients of a node are all removed when none of
    them is characterized.

    A subingredient counts as characterized when its id is in the ingredients data, when it still has
    subingredients of its own (after its own pruning) or when it has a defined percentage.

    Args:
        product (dict): Dict corresponding to a product or a compound ingredient. Must have an 'ingredients' key.
    """
    children = product['ingredients']

    # Pruning the deeper levels first, so that a compound ingredient left without subingredients is seen as a
    # simple ingredient below
    for child in children:
        if 'ingredients' in child:
            clear_ingredient_graph(child)

    # Counting the characterized subingredients
    characterized_count = 0
    for child in children:
        if (child['id'] in ingredients_data) or ('ingredients' in child) or ('percent' in child):
            characterized_count += 1

    # Without any characterized subingredient, the node is treated as a simple ingredient
    if characterized_count == 0:
        del product['ingredients']


def minimum_percentage_sum(ingredients):
    """
    Computes the minimum sum of ingredients percentages for ingredients given in decreasing percentage order, even if
    some ingredients do not have a percentage.

    An ingredient without percentage is at least as present as the nearest following ingredient that has one (or
    0 if there is none).

    Notes:
        This is useful to estimate if subingredients percentages are defined in percentage of their parent ingredient
        or in percentage of the total product.

    Args:
        ingredients (list): List of dicts corresponding to the ingredients. It is not modified.

    Returns:
        float: Minimum value of the sum of all ingredients percentages.
    """

    minimum_sum = 0
    minimum_percentage = 0

    # Looping from least present ingredient to most present. reversed() iterates backwards without copying
    # anything, the ingredients being only read.
    for ingredient in reversed(ingredients):
        if 'percent' in ingredient:
            minimum_percentage = float(ingredient['percent'])

        minimum_sum += minimum_percentage

    return minimum_sum


def maximum_percentage_sum(ingredients):
    """
    Computes the maximum sum of ingredients percentages for ingredients given in decreasing percentage order, even if
    some ingredients do not have a percentage.

    An ingredient without percentage is at most as present as the nearest preceding ingredient that has one (or
    100 if there is none).

    Notes:
        This is useful to estimate if subingredients percentages are defined in percentage of their parent ingredient
        or in percentage of the total product.

    Args:
        ingredients (list): List of dicts corresponding to the ingredients

    Returns:
        float: Maximum value of the sum of all ingredients percentages.
    """

    upper_bounds = []
    current_bound = 100

    # Looping from most present ingredient to least present, carrying the last defined percentage
    for item in ingredients:
        current_bound = float(item['percent']) if 'percent' in item else current_bound
        upper_bounds.append(current_bound)

    return sum(upper_bounds)


def define_subingredients_percentage_type(product):
    """
    Recursively walk the ingredients graph and tag each compound ingredient whose subingredients have at least one
    percentage with a 'percent-type' key telling how these percentages are expressed:

    - 'parent': in percentage of the compound ingredient
    - 'product': in percentage of the whole product
    - 'undefined': both interpretations are possible, or none is

    The graph is modified in place.

    Args:
        product (dict): Dict corresponding to a product or a compound ingredient.
    """
    for rank, ingredient in enumerate(product['ingredients'], 1):
        subingredients = ingredient.get('ingredients')

        # Simple ingredients are not concerned
        if not subingredients:
            continue

        # Handling the deeper levels first
        define_subingredients_percentage_type(ingredient)

        # Nothing to decide if no subingredient has a percentage
        has_percentage = any('percent' in x for x in ingredient['ingredients'])
        if not has_percentage:
            continue

        # The percentages can be relative to the parent only if their maximum sum can reach 100
        maximum_sum = maximum_percentage_sum(ingredient['ingredients'])
        parent_percentage = not (maximum_sum < 100)

        # The percentages can be relative to the product only if their minimum sum does not exceed the percentage
        # of the parent ingredient, itself limited by its natural upper bound
        parent_ingredient_percentage = min(float(ingredient.get('percent', 100)),
                                           natural_bounds(rank, len(product['ingredients']))[1])
        minimum_sum = minimum_percentage_sum(ingredient['ingredients'])
        product_percentage = not (minimum_sum > parent_ingredient_percentage)

        # The type is defined only when exactly one interpretation remains
        if parent_percentage == product_percentage:
            percent_type = 'undefined'
        elif parent_percentage:
            percent_type = 'parent'
        else:
            percent_type = 'product'

        ingredient['percent-type'] = percent_type


def flat_ingredients_list_BFS(product):
    """
    Recursively flatten the ingredients graph, level first: the direct ingredients of the node come first, followed
    by the flattened subingredients of each of them in turn.

    Sub ingredients are thus placed after all the ingredients of their parent's level. The returned nodes are deep
    copies stripped of their 'ingredients' key, the graph given as argument is left untouched.

    Note:
        Below the second level, the order is not a strict breadth first order: all the descendants of an ingredient
        are listed before the descendants of the next ingredient of the same level.

    Args:
        product (dict): Dict corresponding to a product or a compound ingredient.

    Returns:
        list: List containing all the ingredients graph nodes.
    """
    if 'ingredients' not in product:
        return []

    # Working on a deepcopy to avoid deleting the graph structure
    current_level = copy.deepcopy(product['ingredients'])

    descendants = []
    for node in current_level:
        descendants.extend(flat_ingredients_list_BFS(node))

        # The node's descendants have been collected, it can now be flattened
        node.pop('ingredients', None)

    return current_level + descendants


def flat_ingredients_list_DFS(product):
    """
    Recursive function to search the ingredients graph by doing a Depth First Search and return it as a flat list of
    all nodes.
    Sub ingredients are placed right after their parents.

    The node given as argument is included in the list (as a copy stripped of its 'ingredients' key) unless it has
    an '_id' key, in which case it is considered to be the product itself. Nodes without an 'ingredients' key are
    returned as is, not copied. The graph given as argument is left untouched.

    Args:
        product (dict): Dict corresponding to a product or a compound ingredient.

    Returns:
        list: List containing all the ingredients graph nodes.
    """
    if 'ingredients' not in product:
        return [product]

    flattened_subingredients = [node
                                for ingredient in product['ingredients']
                                for node in flat_ingredients_list_DFS(ingredient)]

    if '_id' in product:  # It is a product and not a compound ingredient:
        return flattened_subingredients

    # Copying only the node's own fields. The subingredients are excluded before the copy since they are listed
    # separately, deep copying them at each level would be pure overhead.
    own_fields = {key: value for key, value in product.items() if key != 'ingredients'}

    return [copy.deepcopy(own_fields)] + flattened_subingredients


def find_ingredients_graph_leaves(product):
    """
    Recursively collect the leaves of the ingredients graph, i.e. the nodes without an 'ingredients' key.

    Args:
        product (dict): Dict corresponding to a product or a compound ingredient.

    Returns:
        list: List containing the ingredients graph leaves (the nodes themselves, not copies). If the given node
        has no 'ingredients' key, it is returned as is instead of a list.
    """
    # A node without subingredients is a leaf
    if 'ingredients' not in product:
        return product

    collected = []
    for child in product['ingredients']:
        found = find_ingredients_graph_leaves(child)

        # A compound child gives the list of its leaves, a simple one gives itself
        if type(found) is list:
            collected.extend(found)
        else:
            collected.append(found)

    return collected


def individualize_ingredients(product, previous_ingredients_ids=None):
    """
    Rename in place the ingredients of a product so that no two nodes of the ingredients graph share the same id.

    The graph is walked depth first. An id that has already been met gets asterisks appended until it is unique.

    Args:
        product (dict): Dict corresponding to a product, containing a list of ingredients, may contain compound
            ingredients
        previous_ingredients_ids (list): List containing ingredients ids. Needed only for recursive call

    Examples:
        >>> product = {'ingredients': [{'id': 'A'}, {'id': 'B', 'ingredients': [{'id': 'A'}]}, {'id': 'B'}]}
        >>> individualize_ingredients(product)
        >>> print(product)
        {'ingredients': [{'id': 'A'}, {'id': 'B', 'ingredients': [{'id': 'A*'}]}, {'id': 'B*'}]}
    """
    # The same list is shared by all the recursive calls to remember the ids met so far
    seen_ids = previous_ingredients_ids if previous_ingredients_ids else []

    for node in product['ingredients']:
        unique_id = node['id']

        # Appending an asterisk to the id as long as the id already exists
        while unique_id in seen_ids:
            unique_id += '*'

        node['id'] = unique_id
        seen_ids.append(unique_id)

        if 'ingredients' in node:
            individualize_ingredients(node, previous_ingredients_ids=seen_ids)


def original_id(individualized_id):
    """
    Gets the original id of an ingredient that has been transformed by individualize_ingredients()

    individualize_ingredients() only appends asterisks to the ids, so only the trailing asterisks are removed.

    Args:
        individualized_id (str): Id of the ingredient, possibly followed by asterisks

    Returns:
        str: Id without its trailing asterisks

    Examples:
        >>> original_id('en:water**')
        'en:water'
        >>> original_id('en:sugar')
        'en:sugar'
    """

    # rstrip() and not strip(): asterisks located at the beginning of an id are part of the original id
    return individualized_id.rstrip('*')


class UnknownIngredientsRemover:
    """
    Removes the unknown ingredients of a product and keeps track of the ids of the removed ingredients.

    Attributes:
        removed_unknown_ingredients (list): Ids of the ingredients removed so far by remove_unknown_ingredients(),
            in removal order.
    """

    def __init__(self):
        self.removed_unknown_ingredients = []

    def remove_unknown_ingredients(self, product):
        """
            Recursive function to remove ingredients if they are not in the OFF taxonomy and do not have a
            defined percentage nor valid subingredients.

            The product is modified in place. The 'ingredients' key of a node is deleted if no ingredient remains.
            The ids of the removed ingredients are appended to removed_unknown_ingredients.

            Args:
                product (dict): Dict corresponding to a product or a compound ingredient.
        """

        if 'ingredients' not in product:
            return

        ingredients = product['ingredients']

        # Recursive call on each subingredients, so that a compound ingredient left without subingredients is
        # considered as a simple ingredient below
        for ingredient in ingredients:
            self.remove_unknown_ingredients(ingredient)

        # Keeping ingredients that have sub-ingredients, a defined percentage or are in the OFF taxonomy.
        # Filtering in a single pass avoids copying the graph and searching the list for each removal.
        kept_ingredients = []
        for ingredient in ingredients:
            if ('ingredients' not in ingredient) \
                    and ('percent' not in ingredient) \
                    and ingredient['id'] not in off_taxonomy:
                self.removed_unknown_ingredients.append(ingredient['id'])
            else:
                kept_ingredients.append(ingredient)

        # Updating the list in place, as other references to it may exist
        ingredients[:] = kept_ingredients

        # Removing the 'ingredients' key if empty
        if not ingredients:
            del product['ingredients']


def remove_percentage_from_product(product):
    """
    Recursively delete, in place, the 'percent' key of all the ingredients and subingredients of a product.

    The 'percent' key of the given node itself, if any, is left untouched.

    Args:
        product (dict): Dict corresponding to a product or a compound ingredient. Must have an 'ingredients' key.
    """

    for node in product['ingredients']:
        # Dropping the percentage if there is one
        node.pop('percent', None)

        if 'ingredients' in node:
            remove_percentage_from_product(node)


def weighted_geometric_mean(values, weights):
    """
    Returns the weighted geometric mean of values, computed as the exponential of the weighted arithmetic mean of
    their logarithms.

    Args:
        values (iterable): Values to average. Must have a length.
        weights (iterable): Weight of each value, in the same order. Must have the same length as values.

    Returns:
        float: Weighted geometric mean

    Raises:
        AssertionError: If values and weights do not have the same length.
    """

    assert len(values) == len(weights)

    weighted_logs = [weight * np.log(value) for value, weight in zip(values, weights)]
    total_weight = sum(weights)

    return np.exp(sum(weighted_logs) / total_weight)


def agribalyse_impact_name_i18n(impact_name):
    """
    Returns the French version of an impact name

    A name found in the English to French mapping is translated, a name found in the French impact categories is
    returned unchanged.

    Args:
        impact_name (str): Name of the impact, in English or in French

    Returns:
        str: French name of the impact

    Raises:
        ValueError: If the impact name is found neither in the English to French mapping nor in the French impact
            categories.

    Examples:
        >>> agribalyse_impact_name_i18n('Climate change')
        'Changement climatique'
        >>> agribalyse_impact_name_i18n("Appauvrissement de la couche d'ozone")
        "Appauvrissement de la couche d'ozone"
    """

    # English name: translating it
    if impact_name in AGRIBALYSE_IMPACT_CATEGORIES_EN_TO_FR:
        return AGRIBALYSE_IMPACT_CATEGORIES_EN_TO_FR[impact_name]

    if impact_name not in AGRIBALYSE_IMPACT_CATEGORIES_FR:
        raise ValueError(f'Unrecognized impact: {impact_name}')

    # Already a French name
    return impact_name