certtools · Aug 23, 2017 · Jul 18, 2017 · Jul 24, 2017 · Jul 25, 2017 · Aug 16, 2017
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,11 +3,23 @@ CHANGELOG
 
 1.1.0
 -----
+### Core
+- Subitems in fields of type `JSONDict` (see below) can be accessed directly. E.g. you can do:
+  event['extra.foo'] = 'bar'
+  event['extra.foo'] # gives 'bar'
+  It is still possible to set and get the field as a whole; however, this may be removed or changed in the future:
+  event['extra'] = '{"foo": "bar"}'
+  event['extra'] # gives '{"foo": "bar"}'
+  "Old" bots and configurations compatible with 1.0.x do still work.
+  Also, the extra field is now properly exploded when exporting events, analogous to all other fields.
 
 ### Bots
 #### Collectors
 - Mail: New parameters; `sent_from`: filter messages by sender, `sent_to`: filter messages by recipient
 
+### Harmonization
+- Renamed `JSON` to `JSONDict` and added a new type `JSON`. `JSONDict` saves data internally as JSON, but acts like a dictionary. `JSON` accepts any valid JSON.
+
 ### Requirements
 - Requests is no longer a listed as dependency of the core. For depending bots the requirement is noted in their REQUIREMENTS.txt file
 

diff --git a/NEWS.md b/NEWS.md
@@ -5,6 +5,8 @@ See the changelog for a full list of changes.
 
 1.1.0
 -----
+### Configuration
+A new harmonization type `JSONDict` has been added specifically for the `extra` field. It is highly recommended to change the type of this field to `JSONDict` in your harmonization configuration.
 
 1.0.0 Stable release
 --------------------

diff --git a/docs/Data-Harmonization.md b/docs/Data-Harmonization.md
@@ -66,6 +66,9 @@ We recognize that ip geolocation is not an exact science and analysis of the abu
 
 Some sources report an internal (NATed) IP address.
 
+### Extra values
+Data which does not fit in the harmonization can be saved in the 'extra' namespace. All keys must begin with `extra.`; there are no other rules on key names and values. The values can be read and set like those of all other fields.
+
 <a name="fields-list-and-data-types"></a>
 ## Fields List and data types
 

diff --git a/docs/Harmonization-fields.md b/docs/Harmonization-fields.md
@@ -34,7 +34,7 @@ Harmonization field names
 |Event_Description|event_description.text|[String](#string)|A free-form textual description of an abuse event.|
 |Event_Description|event_description.url|[URL](#url)|A description URL is a link to a further description of the the abuse event in question.|
 | |event_hash|[UppercaseString](#uppercasestring)|Computed event hash with specific keys and values that identify a unique event. At present, the hash should default to using the SHA1 function. Please note that for an event hash to be able to match more than one event (deduplication) the receiver of an event should calculate it based on a minimal set of keys and values present in the event. Using for example the observation time in the calculation will most likely render the checksum useless for deduplication purposes.|
-| |extra|[JSON](#json)|All anecdotal information, which cannot be parsed into the data harmonization elements. E.g. os.name, os.version, etc.  **Note**: this is only intended for mapping any fields which can not map naturally into the data harmonization. It is not intended for extending the data harmonization with your own fields.|
+| |extra|[JSONDict](#jsondict)|All anecdotal information, which cannot be parsed into the data harmonization elements. E.g. os.name, os.version, etc.  **Note**: this is only intended for mapping any fields which can not map naturally into the data harmonization. It is not intended for extending the data harmonization with your own fields.|
 |Feed|feed.accuracy|[Accuracy](#accuracy)|A float between 0 and 100 that represents how accurate the data in the feed is|
 |Feed|feed.code|[String](#string)|Code name for the feed, e.g. DFGS, HSDAG etc.|
 |Feed|feed.documentation|[String](#string)|A URL or hint where to find the documentation of this feed.|
@@ -204,6 +204,15 @@ Sanitation accepts strings and everything int() accepts.
 
 JSON type.
 
+Sanitation accepts any valid JSON objects.
+
+Valid values are only unicode strings with JSON objects.
+
+
+### JSONDict
+
+JSONDict type.
+
 Sanitation accepts pythons dictionaries and JSON strings.
 
 Valid values are only unicode strings with JSON dictionaries.

diff --git a/intelmq/bin/intelmq_psql_initdb.py b/intelmq/bin/intelmq_psql_initdb.py
@@ -55,7 +55,7 @@ def generate(harmonization_file=HARMONIZATION_CONF_FILE):
             dbtype = 'real'
         elif value['type'] == 'UUID':
             dbtype = 'UUID'
-        elif value['type'] == 'JSON':
+        elif value['type'] in ('JSON', 'JSONDict'):
             dbtype = 'json'
         else:
             raise ValueError('Unknown type %r.' % value['type'])

diff --git a/intelmq/bin/intelmqctl.py b/intelmq/bin/intelmqctl.py
@@ -883,6 +883,10 @@ def check(self):
                         self.logger.error('Invalid regex for type %r: %r.', harm_type_name, str(e))
                         retval = 1
                         continue
+        extra_type = files[HARMONIZATION_CONF_FILE].get('event', {}).get('extra', {}).get('type')
+        if extra_type != 'JSONDict':
+            self.logger.warning("'extra' field needs to be of type 'JSONDict'.")
+            retval = 1
 
         self.logger.info('Checking for bots.')
         for bot_id, bot_config in files[RUNTIME_CONF_FILE].items():

diff --git a/intelmq/bots/parsers/generic/parser_csv.py b/intelmq/bots/parsers/generic/parser_csv.py
@@ -64,7 +64,6 @@ def parse(self, report):
     def parse_line(self, row, report):
         event = self.new_event(report)
 
-        extra = {}
         for key, value in zip(self.columns, row):
             regex = self.column_regex_search.get(key, None)
             if regex:
@@ -85,18 +84,12 @@ def parse_line(self, row, report):
                     value = self.type_translation[value]
                 elif not hasattr(self.parameters, 'type'):
                     continue
-            if key.startswith('extra.'):
-                if value:
-                    extra[key[6:]] = value
-            else:
-                event.add(key, value)
+            event.add(key, value)
 
         if hasattr(self.parameters, 'type')\
                 and "classification.type" not in event:
             event.add('classification.type', self.parameters.type)
         event.add("raw", self.recover_line(row))
-        if extra:
-            event.add('extra', extra)
         yield event
 
     recover_line = ParserBot.recover_line_csv

diff --git a/intelmq/bots/parsers/shadowserver/config.py b/intelmq/bots/parsers/shadowserver/config.py
@@ -919,6 +919,8 @@ def validate_fqdn(value):
         ('extra.', 'system', validate_to_none),
         ('extra.', 'detected_since', validate_to_none),
         ('extra.', 'server', validate_to_none),
+        ('extra.', 'naics', invalidate_zero),
+        ('extra.', 'sic', invalidate_zero),
     ],
     'constant_fields': {
         'classification.type': 'compromised',

diff --git a/intelmq/etc/harmonization.conf b/intelmq/etc/harmonization.conf
@@ -130,7 +130,7 @@
         },
         "extra": {
             "description": "All anecdotal information, which cannot be parsed into the data harmonization elements. E.g. os.name, os.version, etc.  **Note**: this is only intended for mapping any fields which can not map naturally into the data harmonization. It is not intended for extending the data harmonization with your own fields.",
-            "type": "JSON"
+            "type": "JSONDict"
         },
         "feed.accuracy": {
             "description": "A float between 0 and 100 that represents how accurate the data in the feed is",

diff --git a/intelmq/lib/harmonization.py b/intelmq/lib/harmonization.py
@@ -31,8 +31,9 @@
 
 __all__ = ['Base64', 'Boolean', 'ClassificationType', 'DateTime', 'FQDN',
            'Float', 'Accuracy', 'GenericType', 'IPAddress', 'IPNetwork',
-           'Integer', 'JSON', 'LowercaseString', 'Registry', 'String', 'URL',
-           'ASN']
+           'Integer', 'JSON', 'JSONDict', 'LowercaseString', 'Registry',
+           'String', 'URL', 'ASN',
+           ]
 
 
 class GenericType(object):
@@ -594,6 +595,44 @@ class JSON(GenericType):
     """
     JSON type.
 
+    Sanitation accepts any valid JSON objects.
+
+    Valid values are only unicode strings with JSON objects.
+    """
+
+    @staticmethod
+    def is_valid(value, sanitize=False):
+        if sanitize:
+            value = JSON().sanitize(value)
+
+        if not isinstance(value, str):
+            return False
+
+        try:
+            json.loads(value)
+        except ValueError:
+            return False
+
+        return True
+
+    @staticmethod
+    def sanitize(value):
+        if value is None:
+            return None
+        if isinstance(value, (str, bytes)):
+            sanitized = GenericType.sanitize(value)
+            if JSON.is_valid(sanitized):
+                return sanitized
+        try:
+            return GenericType().sanitize(json.dumps(value, sort_keys=True))
+        except TypeError:
+            return None
+
+
+class JSONDict(JSON):
+    """
+    JSONDict type.
+
     Sanitation accepts pythons dictionaries and JSON strings.
 
     Valid values are only unicode strings with JSON dictionaries.
@@ -602,7 +641,7 @@ class JSON(GenericType):
     @staticmethod
     def is_valid(value, sanitize=False):
         if sanitize:
-            value = JSON().sanitize(value)
+            value = JSONDict().sanitize(value)
 
         if not isinstance(value, str):
             return False
@@ -617,19 +656,27 @@ def is_valid(value, sanitize=False):
 
         return False
 
+    @staticmethod
+    def is_valid_subitem(value):
+        return True
+
     @staticmethod
     def sanitize(value):
         if not value:
             return None
         if isinstance(value, (str, bytes)):
             sanitized = GenericType.sanitize(value)
-            if JSON.is_valid(sanitized):
+            if JSONDict.is_valid(sanitized):
                 return sanitized
         try:
             return GenericType().sanitize(json.dumps(value, sort_keys=True))
         except TypeError:
             return None
 
+    @staticmethod
+    def sanitize_subitem(value):
+        return value
+
 
 class LowercaseString(GenericType):
     """

diff --git a/intelmq/lib/message.py b/intelmq/lib/message.py
@@ -4,6 +4,7 @@
 
 Use MessageFactory to get a Message object (types Report and Event).
 """
+import functools
 import hashlib
 import json
 import re
@@ -85,6 +86,8 @@ def serialize(message):
 
 class Message(dict):
 
+    _IGNORED_VALUES = ["", "-", "N/A"]
+
     def __init__(self, message=(), auto=False, harmonization=None):
         try:
             classname = message['__type'].lower()
@@ -102,6 +105,11 @@ def __init__(self, message=(), auto=False, harmonization=None):
                                              expected=VALID_MESSSAGE_TYPES,
                                              docs=HARMONIZATION_CONF_FILE)
 
+        if classname == 'event' and self.harmonization_config['extra']['type'] == 'JSON':
+            warnings.warn("Assuming harmonization type 'JSONDict' for harmonization field 'extra'. "
+                          "This assumption will be removed in version 2.0.", DeprecationWarning)
+            self.harmonization_config['extra']['type'] = 'JSONDict'
+
         super(Message, self).__init__()
         if isinstance(message, dict):
             iterable = message.items()
@@ -114,6 +122,14 @@ def __init__(self, message=(), auto=False, harmonization=None):
     def __setitem__(self, key, value):
         self.add(key, value)
 
+    def __getitem__(self, key):
+        class_name, subitem = self.__get_type_config(key)
+        if class_name['type'] == 'JSONDict' and not subitem:
+            # return extra as string for backwards compatibility
+            return json.dumps(self.to_dict(hierarchical=True)[key.split('.')[0]])
+        else:
+            return super(Message, self).__getitem__(key)
+
     def is_valid(self, key: str, value: str, sanitize: bool=True) -> bool:
         """
         Checks if a value is valid for the key (after sanitation).
@@ -174,7 +190,7 @@ def add(self, key: str, value: str, sanitize: bool=True, force: bool=False,
         if not overwrite and key in self:
             raise exceptions.KeyExists(key)
 
-        if value is None or value in ["", "-", "N/A"]:
+        if value is None or value in self._IGNORED_VALUES:
             if overwrite and key in self:
                 del self[key]
             return
@@ -206,7 +222,19 @@ def add(self, key: str, value: str, sanitize: bool=True, force: bool=False,
             else:
                 return False
 
-        super(Message, self).__setitem__(key, value)
+        class_name, subitem = self.__get_type_config(key)
+        if class_name and class_name['type'] == 'JSONDict' and not subitem:
+            # for backwards compatibility allow setting the extra field as string
+            for extrakey, extravalue in json.loads(value).items():
+                if hasattr(extravalue, '__len__'):
+                    if not len(extravalue):  # ignore empty values
+                        continue
+                if extravalue in self._IGNORED_VALUES:
+                    continue
+                super(Message, self).__setitem__('%s.%s' % (key, extrakey),
+                                                 extravalue)
+        else:
+            super(Message, self).__setitem__(key, value)
         return True
 
     def update(self, other: dict):
@@ -251,17 +279,26 @@ def unserialize(message_string: str):
         message = json.loads(message_string)
         return message
 
+    @functools.lru_cache(maxsize=None)
     def __is_valid_key(self, key: str):
-        if key in self.harmonization_config or key == '__type':
+        try:
+            class_name, subitem = self.__get_type_config(key)
+        except KeyError:
+            return False
+        if key in self.harmonization_config or key == '__type' or subitem:
             return True
         return False
 
     def __is_valid_value(self, key: str, value: str):
         if key == '__type':
             return (True, )
-        config = self.__get_type_config(key)
+        config, subitem = self.__get_type_config(key)
         class_reference = getattr(intelmq.lib.harmonization, config['type'])
-        if not class_reference().is_valid(value):
+        if not subitem:
+            validation = class_reference().is_valid(value)
+        else:
+            validation = class_reference().is_valid_subitem(value)
+        if not validation:
             return (False, 'is_valid returned False.')
         if 'length' in config:
             length = len(str(value))
@@ -277,13 +314,26 @@ def __is_valid_value(self, key: str, value: str):
         return (True, )
 
     def __sanitize_value(self, key: str, value: str):
-        class_name = self.__get_type_config(key)['type']
-        class_reference = getattr(intelmq.lib.harmonization, class_name)
-        return class_reference().sanitize(value)
+        class_name, subitem = self.__get_type_config(key)
+        class_reference = getattr(intelmq.lib.harmonization, class_name['type'])
+        if not subitem:
+            return class_reference().sanitize(value)
+        else:
+            return class_reference().sanitize_subitem(value)
 
+    @functools.lru_cache(maxsize=None)
     def __get_type_config(self, key: str):
-        class_name = self.harmonization_config[key]
-        return class_name
+        if key == '__type':
+            return None, None
+        try:
+            class_name = self.harmonization_config[key]
+        except KeyError:
+            # Could be done recursively in the future if needed
+            class_name = self.harmonization_config[key.split('.')[0]]
+            subitem = True
+        else:
+            subitem = False
+        return class_name, subitem
 
     def __hash__(self):
         return int(self.hash(), 16)
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,6 +5,8 @@ See the changelog for a full list of changes. @@
 .1.0
     -----
+    ### Configuration
+    A new harmonization type `JSONDict` has been added specifically for the `extra` field. It is highly recommended to change the type of this field.
 .0.0 Stable release
     --------------------
@@ Expand Down @@