74 changes: 0 additions & 74 deletions backend/app/heartbeat_listener_sync.py

This file was deleted.

2 changes: 1 addition & 1 deletion backend/app/models/feeds.py
@@ -25,7 +25,7 @@ class FeedIn(JobFeed):


 class FeedDB(JobFeed, MongoModel):
-    author: UserOut
+    author: Optional[UserOut] = None
     updated: datetime = Field(default_factory=datetime.utcnow)


17 changes: 14 additions & 3 deletions backend/app/models/files.py
@@ -1,20 +1,31 @@
 from datetime import datetime
 from typing import Optional

-from pydantic import Field
+from pydantic import Field, BaseModel

 from app.models.mongomodel import MongoModel
 from app.models.pyobjectid import PyObjectId
 from app.models.users import UserOut


+class FileContentType(BaseModel):
+    """This model describes the content type of a file uploaded to Clowder. A typical example is "text/plain" for .txt.
+    In Clowder v1 extractors, "text/*" syntax is acceptable for wildcard matches. To support this, the content type is
+    split into main ("text") and secondary ("plain") parts so the dynamic matching with * can still be done.
+
+    """
+
+    content_type: str = "N/A"
+    main_type: str = "N/A"
+
+
 class FileVersion(MongoModel):
     version_id: str
     version_num: int = 1
     file_id: PyObjectId
     creator: UserOut
     bytes: int = 0
-    content_type: str = "N/A"
+    content_type: FileContentType = FileContentType()
     created: datetime = Field(default_factory=datetime.utcnow)


@@ -36,7 +47,7 @@ class FileDB(FileBase):
     views: int = 0
     downloads: int = 0
     bytes: int = 0
-    content_type: str = "N/A"
+    content_type: FileContentType = FileContentType()


 class FileOut(FileDB):
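As a minimal sketch of the split this model enables (the helper below is hypothetical, not part of this PR), a MIME string maps onto FileContentType like so:

from app.models.files import FileContentType

def to_content_type(mime: str) -> FileContentType:
    # "text/plain" -> main_type "text"; a bare "text" keeps the whole string
    main = mime.split("/")[0] if "/" in mime else mime
    return FileContentType(content_type=mime, main_type=main)

ct = to_content_type("text/plain")
assert ct.main_type == "text"  # a v1 "text/*" rule can match on main_type alone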
11 changes: 0 additions & 11 deletions backend/app/models/search.py
@@ -3,17 +3,6 @@
 from typing import Optional, List


-# TODO: may eventually be split by index (resource type)
-class SearchIndexContents(BaseModel):
-    """This describes what is indexed in Elasticsearch for a given resource."""
-
-    id: str
-    name: str
-    creator: str  # currently just email
-    created: datetime
-    download: int
-
-
 class SearchCriteria(BaseModel):
     field: str
     operator: str = "=="
44 changes: 44 additions & 0 deletions backend/app/rabbitmq/heartbeat_listener_sync.py
@@ -5,6 +5,8 @@
 from pymongo import MongoClient

 from app.config import settings
+from app.models.search import SearchCriteria
+from app.routers.feeds import FeedIn, FeedListener, FeedOut, FeedDB, associate_listener
 from app.models.listeners import EventListenerDB, EventListenerOut, ExtractorInfo

 logging.basicConfig(level=logging.INFO)
@@ -51,6 +53,48 @@ def callback(ch, method, properties, body):
         found = db["listeners"].find_one({"_id": new_extractor.inserted_id})
         extractor_out = EventListenerOut.from_mongo(found)
         logger.info("New extractor registered: " + extractor_name)
+
+        # Assign MIME-based listener if needed
+        if extractor_out.properties and extractor_out.properties.process:
+            process = extractor_out.properties.process
+            if "file" in process:
+                # Create a MIME-based feed for this v1 extractor
+                criteria_list = []
+                for mime in process["file"]:
+                    main_type = mime.split("/")[0] if mime.find("/") > -1 else mime
+                    sub_type = mime.split("/")[1] if mime.find("/") > -1 else None
+                    if sub_type:
+                        if sub_type == "*":
+                            # If a wildcard, just match on main type
+                            criteria_list.append(
+                                SearchCriteria(
+                                    field="content_type_main", value=main_type
+                                )
+                            )
+                        else:
+                            # Otherwise match the whole string
+                            criteria_list.append(
+                                SearchCriteria(field="content_type", value=mime)
+                            )
+                    else:
+                        criteria_list.append(
+                            SearchCriteria(field="content_type", value=mime)
+                        )
+
+                # TODO: Who should the author be for an auto-generated feed? Currently None.
+                new_feed = FeedDB(
+                    name=extractor_name,
+                    search={
+                        "index_name": "file",
+                        "criteria": criteria_list,
+                        "mode": "or",
+                    },
+                    listeners=[
+                        FeedListener(listener_id=extractor_out.id, automatic=True)
+                    ],
+                )
+                db["feeds"].insert_one(new_feed.to_mongo())
+
         return extractor_out

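To see what the registration loop above produces, here is a minimal sketch assuming a hypothetical v1 extractor whose extractor_info lists two MIME rules; only the wildcard rule falls back to the main-type field:

from app.models.search import SearchCriteria

# Hypothetical v1 extractor_info fragment; "text/*" is a wildcard on the main type.
process = {"file": ["text/*", "application/json"]}

criteria_list = []
for mime in process["file"]:
    main_type, _, sub_type = mime.partition("/")
    if sub_type == "*":
        # Wildcard: match only the indexed main type ("text")
        criteria_list.append(SearchCriteria(field="content_type_main", value=main_type))
    else:
        # No "/" or a concrete subtype: match the full MIME string
        criteria_list.append(SearchCriteria(field="content_type", value=mime))

# criteria_list -> ["content_type_main" == "text", "content_type" == "application/json"],
# joined with mode="or" in the auto-generated feed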
1 change: 0 additions & 1 deletion backend/app/routers/feeds.py
@@ -17,7 +17,6 @@
     FeedDB,
     FeedOut,
 )
-from app.models.search import SearchIndexContents
 from app.search.connect import check_search_result
 from app.rabbitmq.listeners import submit_file_job

12 changes: 7 additions & 5 deletions backend/app/routers/files.py
@@ -31,7 +31,7 @@
     update_record,
     delete_document_by_query,
 )
-from app.models.files import FileIn, FileOut, FileVersion, FileDB
+from app.models.files import FileIn, FileOut, FileVersion, FileContentType, FileDB
 from app.models.users import UserOut
 from app.routers.feeds import check_feed_listeners
 from app.keycloak_auth import get_user, get_current_user, get_token
@@ -70,6 +70,8 @@ async def add_file_entry(
     if content_type is None:
         content_type = mimetypes.guess_type(file_db.name)
         content_type = content_type[0] if len(content_type) > 1 else content_type
+    type_main = content_type.split("/")[0] if type(content_type) is str else "N/A"
+    content_type_obj = FileContentType(content_type=content_type, main_type=type_main)

     # Use unique ID as key for Minio and get initial version ID
     response = fs.put_object(
@@ -87,7 +89,7 @@
     file_db.version_id = version_id
     file_db.version_num = 1
     file_db.bytes = bytes
-    file_db.content_type = content_type if type(content_type) is str else "N/A"
+    file_db.content_type = content_type_obj
     await db["files"].replace_one({"_id": ObjectId(new_file_id)}, file_db.to_mongo())
     file_out = FileOut(**file_db.dict())

@@ -97,7 +99,7 @@
         file_id=new_file_id,
         creator=user,
         bytes=bytes,
-        content_type=file_db.content_type,
+        content_type=content_type_obj,
     )
     await db["file_versions"].insert_one(new_version.to_mongo())

@@ -110,7 +112,8 @@
         "dataset_id": str(file_db.dataset_id),
         "folder_id": str(file_db.folder_id),
         "bytes": file_db.bytes,
-        "content_type": file_db.content_type,
+        "content_type": content_type_obj.content_type,
+        "content_type_main": content_type_obj.main_type,
     }
     insert_record(es, "file", doc, file_db.id)

@@ -210,7 +213,6 @@ async def update_file(
             {"resource.resource_id": ObjectId(updated_file.id)}
         )
     ) is not None:
-        print("updating metadata")
         doc = {
             "doc": {
                 "name": updated_file.name,
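For reference, a small sketch of the upload path touched above (standard-library mimetypes only; the file name is made up). guess_type returns a (type, encoding) tuple, and anything unguessable degrades to the "N/A" defaults:

import mimetypes

guessed, _encoding = mimetypes.guess_type("notes.txt")  # ("text/plain", None)
content_type = guessed if isinstance(guessed, str) else "N/A"
main_type = content_type.split("/")[0] if "/" in content_type else "N/A"
# FileContentType(content_type="text/plain", main_type="text") then feeds both the
# Mongo document and the "content_type"/"content_type_main" Elasticsearch fields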